This commit is contained in:
feiga 2015-10-14 12:16:19 +08:00
Parent e1ec448d22
Commit 82d261863c
25 changed files with 2941 additions and 0 deletions

3
.gitignore vendored Normal file

@@ -0,0 +1,3 @@
bin/
multiverso
src/*.o

45
Makefile Normal file

@@ -0,0 +1,45 @@
PROJECT := $(shell readlink $(dir $(lastword $(MAKEFILE_LIST))) -f)
CXX = g++
CXXFLAGS = -O3 \
-std=c++11 \
-Wall \
-Wno-sign-compare \
-fno-omit-frame-pointer
MULTIVERSO_DIR = $(PROJECT)/multiverso
MULTIVERSO_INC = $(MULTIVERSO_DIR)/include/multiverso
MULTIVERSO_LIB = $(MULTIVERSO_DIR)/lib
THIRD_PARTY_LIB = $(MULTIVERSO_DIR)/third_party/lib
INC_FLAGS = -I$(MULTIVERSO_INC)
LD_FLAGS = -L$(MULTIVERSO_LIB) -lmultiverso
LD_FLAGS += -L$(THIRD_PARTY_LIB) -lzmq -lmpi -lmpl
WORD_EMBEDDING_HEADERS = $(shell find $(PROJECT)/src -type f -name "*.h")
WORD_EMBEDDING_SRC = $(shell find $(PROJECT)/src -type f -name "*.cpp")
WORD_EMBEDDING_OBJ = $(WORD_EMBEDDING_SRC:.cpp=.o)
BIN_DIR = $(PROJECT)/bin
WORD_EMBEDDING = $(BIN_DIR)/multisense_word_embedding
all: path \
multisense_word_embedding
path: $(BIN_DIR)
$(BIN_DIR):
mkdir -p $@
$(WORD_EMBEDDING): $(WORD_EMBEDDING_OBJ)
$(CXX) $(WORD_EMBEDDING_OBJ) $(CXXFLAGS) $(INC_FLAGS) $(LD_FLAGS) -o $@
$(WORD_EMBEDDING_OBJ): %.o: %.cpp $(WORD_EMBEDDING_HEADERS) $(MULTIVERSO_INC)
$(CXX) $(CXXFLAGS) $(INC_FLAGS) -c $< -o $@
multisense_word_embedding: path $(WORD_EMBEDDING)
clean:
rm -rf $(BIN_DIR) $(WORD_EMBEDDING_OBJ)
.PHONY: all path multisense_word_embedding clean

12
build.sh Normal file

@@ -0,0 +1,12 @@
#!/bin/bash
# build word_embedding
set -e
git clone https://github.com/msraai/multiverso
cd multiverso
# build multiverso's third-party dependencies first
cd third_party
sh install.sh
cd ..
# build the multiverso library
make -j4 all
cd ..
# build multisense_word_embedding against the library
make -j4

60
src/DataBlock.cpp Normal file

@@ -0,0 +1,60 @@
#include "DataBlock.h"
size_t DataBlock::Size()
{
return m_sentences.size();
}
void DataBlock::Add(int *head, int sentence_length, int64_t word_count, uint64_t next_random)
{
Sentence sentence(head, sentence_length, word_count, next_random);
m_sentences.push_back(sentence);
}
void DataBlock::UpdateNextRandom()
{
for (int i = 0; i < m_sentences.size(); ++i)
m_sentences[i].next_random *= (uint64_t)rand();
}
void DataBlock::Get(int index, int* &head, int &sentence_length, int64_t &word_count, uint64_t &next_random)
{
if (index >= 0 && index < m_sentences.size())
{
m_sentences[index].Get(head, sentence_length, word_count, next_random);
}
else
{
head = nullptr;
sentence_length = 0;
word_count = 0;
next_random = 0;
}
}
void DataBlock::ReleaseSentences()
{
for (size_t i = 0; i < m_sentences.size(); ++i)
delete[] m_sentences[i].head; // sentences are allocated with new[], so use delete[]
m_sentences.clear();
}
void DataBlock::AddTable(int table_id)
{
m_tables.push_back(table_id);
}
std::vector<int> & DataBlock::GetTables()
{
return m_tables;
}
void DataBlock::SetEpochId(const int epoch_id)
{
m_epoch_id = epoch_id;
}
int DataBlock::GetEpochId()
{
return m_epoch_id;
}

76
src/DataBlock.h Normal file

@@ -0,0 +1,76 @@
#pragma once
/*!
* \file DataBlock.h
* \brief Defines class DataBlock to store the necessary data for trainer and param_loader
* \author
* - v-fetia
*/
#include "Util.h"
#include <multiverso.h>
#include "HuffmanEncoder.h"
/*!
* \brief The class DataBlock stores the training data for the trainer and param_loader
*/
class DataBlock : public multiverso::DataBlockBase
{
public:
/*!
* \brief Get the number of sentences stored in DataBlock
* \return the number of sentences
*/
size_t Size();
/*!
* \brief Add a new sentence to the DataBlock
* \param sentence the starting address of the sentence
* \param sentence_length the length of the sentence
* \param word_count the number of words when getting the sentence from train-file
* \param next_random the seed for getting random number
*/
void Add(int *sentence, int sentence_length, int64_t word_count, uint64_t next_random);
/*!
* \brief Get the information of the index-th sentence
* \param index the id of the sentence
* \param sentence the starting address of the sentence
* \param sentence_length the number of words in the sentence
* \param word_count the number of words when getting the sentence from train-file
* \param next_random the seed for getting random number
*/
void Get(int index, int* &sentence, int &sentence_length, int64_t &word_count, uint64_t &next_random);
void UpdateNextRandom();
void AddTable(int table_id);
std::vector <int> & GetTables();
void ReleaseSentences();
int GetEpochId();
void SetEpochId(const int epoch_id);
private:
struct Sentence
{
int* head;
int length;
int64_t word_count;
uint64_t next_random;
Sentence(int *head, int length, int64_t word_count, uint64_t next_random)
:head(head), length(length), word_count(word_count), next_random(next_random){}
void Get(int* &local_head, int &sentence_length, int64_t &local_word_count, uint64_t &local_next_random)
{
local_head = head;
sentence_length = length;
local_word_count = word_count;
local_next_random = next_random;
}
};
std::vector <int> m_tables;
std::vector <Sentence> m_sentences;
int m_epoch_id;
};
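
A minimal usage sketch of the DataBlock API above (not part of this commit); it assumes DataBlock.h and the multiverso headers are on the include path:
#include "DataBlock.h"

int main()
{
    DataBlock block;
    int *sentence = new int[3];  // freed later by ReleaseSentences()
    sentence[0] = 12; sentence[1] = 7; sentence[2] = 42;
    block.Add(sentence, 3 /*sentence_length*/, 3 /*word_count*/, 1234567ULL /*next_random*/);
    int *head; int length; int64_t words; uint64_t seed;
    block.Get(0, head, length, words, seed);  // head now points at sentence
    block.ReleaseSentences();                 // frees every stored sentence
    return 0;
}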

210
src/Dictionary.cpp Normal file

@@ -0,0 +1,210 @@
#include "Dictionary.h"
Dictionary::Dictionary()
{
combine = 0;
Clear();
}
Dictionary::Dictionary(int i)
{
combine = i;
Clear();
}
void Dictionary::Clear()
{
m_word_idx_map.clear();
m_word_info.clear();
m_word_whitelist.clear();
}
void Dictionary::SetWhiteList(const std::vector<std::string>& whitelist)
{
for (unsigned int i = 0; i < whitelist.size(); ++i)
m_word_whitelist.insert(whitelist[i]);
}
void Dictionary::MergeInfrequentWords(int64_t threshold)
{
m_word_idx_map.clear();
std::vector<WordInfo> tmp_info;
tmp_info.clear();
int infreq_idx = -1;
for (auto& word_info : m_word_info)
{
if (word_info.freq >= threshold || word_info.freq == 0 || m_word_whitelist.count(word_info.word))
{
m_word_idx_map[word_info.word] = static_cast<int>(tmp_info.size());
tmp_info.push_back(word_info);
}
else {
if (infreq_idx < 0)
{
WordInfo infreq_word_info;
infreq_word_info.word = "WE_ARE_THE_INFREQUENT_WORDS";
infreq_word_info.freq = 0;
m_word_idx_map[infreq_word_info.word] = static_cast<int>(tmp_info.size());
infreq_idx = static_cast<int>(tmp_info.size());
tmp_info.push_back(infreq_word_info);
}
m_word_idx_map[word_info.word] = infreq_idx;
tmp_info[infreq_idx].freq += word_info.freq;
}
}
m_word_info = tmp_info;
}
void Dictionary::RemoveWordsLessThan(int64_t min_count)
{
m_word_idx_map.clear();
std::vector<WordInfo> tmp_info;
tmp_info.clear();
for (auto& info : m_word_info)
{
if (info.freq >= min_count || info.freq == 0 || m_word_whitelist.count(info.word))
{
m_word_idx_map[info.word] = static_cast<int>(tmp_info.size());
tmp_info.push_back(info);
}
}
m_word_info = tmp_info;
}
void Dictionary::Insert(const char* word, int64_t cnt)
{
const auto& it = m_word_idx_map.find(word);
if (it != m_word_idx_map.end())
m_word_info[it->second].freq += cnt;
else
{
m_word_idx_map[word] = static_cast<int>(m_word_info.size());
m_word_info.push_back(WordInfo(word, cnt));
}
}
void Dictionary::LoadFromFile(const char* filename)
{
FILE* fid = fopen(filename, "r");
if(fid)
{
char sz_label[MAX_WORD_SIZE];
while (fscanf(fid, "%s", sz_label, MAX_WORD_SIZE) != EOF)
{
int freq;
fscanf(fid, "%d", &freq);
Insert(sz_label, freq);
}
fclose(fid);
}
}
void Dictionary::LoadTriLetterFromFile(const char* filename, unsigned int min_cnt, unsigned int letter_count)
{
FILE* fid = fopen(filename, "r");
if(fid)
{
char sz_label[MAX_WORD_SIZE];
while (fscanf(fid, "%s", sz_label, MAX_WORD_SIZE) != EOF)
{
int freq;
fscanf(fid, "%d", &freq);
if (static_cast<unsigned int>(freq) < min_cnt) continue;
// Construct Tri-letter From word
size_t len = strlen(sz_label);
if (len >= MAX_WORD_SIZE)
{
printf("ignore super long term\n");
continue;
}
char tri_letters[MAX_WORD_SIZE + 2];
tri_letters[0] = '#';
int i = 0;
for (i = 0; i < strlen(sz_label); i++)
{
tri_letters[i+1] = sz_label[i];
}
tri_letters[i+1] = '#';
tri_letters[i+2] = 0;
if (combine) Insert(sz_label,freq);
if (strlen(tri_letters) <= letter_count) {
Insert(tri_letters, freq);
} else {
for (i = 0; i <= strlen(tri_letters) - letter_count; ++i)
{
char tri_word[MAX_WORD_SIZE];
unsigned int j = 0;
for(j = 0; j < letter_count; j++)
{
tri_word[j] = tri_letters[i+j];
}
tri_word[j] = 0;
Insert(tri_word, freq);
}
}
}
fclose(fid);
}
}
int Dictionary::GetWordIdx(const char* word)
{
const auto& it = m_word_idx_map.find(word);
if (it != m_word_idx_map.end())
return it->second;
return -1;
}
int Dictionary::Size()
{
return static_cast<int>(m_word_info.size());
}
const WordInfo* Dictionary::GetWordInfo(const char* word)
{
const auto& it = m_word_idx_map.find(word);
if (it != m_word_idx_map.end())
return GetWordInfo(it->second);
return NULL;
}
const WordInfo* Dictionary::GetWordInfo(int word_idx)
{
if (word_idx >= 0 && word_idx < m_word_info.size())
return &m_word_info[word_idx];
return NULL;
}
void Dictionary::StartIteration()
{
m_word_iterator = m_word_info.begin();
}
bool Dictionary::HasMore()
{
return m_word_iterator != m_word_info.end();
}
const WordInfo* Dictionary::Next()
{
const WordInfo* entry = &(*m_word_iterator);
++m_word_iterator;
return entry;
}
std::vector<WordInfo>::iterator Dictionary::Begin()
{
return m_word_info.begin();
}
std::vector<WordInfo>::iterator Dictionary::End()
{
return m_word_info.end();
}

55
src/Dictionary.h Normal file

@@ -0,0 +1,55 @@
#pragma once
#include <string>
#include <vector>
#include <unordered_map>
#include <unordered_set>
#include "Util.h"
const int MAX_WORD_SIZE = 901;
struct WordInfo
{
std::string word;
int64_t freq;
WordInfo()
{
freq = 0;
word.clear();
}
WordInfo(const std::string& _word, int64_t _freq)
{
word = _word;
freq = _freq;
}
};
class Dictionary
{
public:
Dictionary();
Dictionary(int i);
void Clear();
void SetWhiteList(const std::vector<std::string>& whitelist);
void RemoveWordsLessThan(int64_t min_count);
void MergeInfrequentWords(int64_t threshold);
void Insert(const char* word, int64_t cnt = 1);
void LoadFromFile(const char* filename);
void LoadTriLetterFromFile(const char* filename, unsigned int min_cnt = 1, unsigned int letter_count = 3);
int GetWordIdx(const char* word);
const WordInfo* GetWordInfo(const char* word);
const WordInfo* GetWordInfo(int word_idx);
int Size();
void StartIteration();
bool HasMore();
const WordInfo* Next();
std::vector<WordInfo>::iterator Begin();
std::vector<WordInfo>::iterator End();
private:
int combine;
std::vector<WordInfo> m_word_info;
std::vector<WordInfo>::iterator m_word_iterator;
std::unordered_map<std::string, int> m_word_idx_map;
std::unordered_set<std::string> m_word_whitelist;
};
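
A minimal usage sketch of the Dictionary API (not part of this commit), assuming Dictionary.h is on the include path:
#include "Dictionary.h"
#include <cstdio>

int main()
{
    Dictionary dict;
    dict.Insert("apple", 5);
    dict.Insert("apple");        // default cnt = 1, freq becomes 6
    dict.Insert("pear");
    dict.RemoveWordsLessThan(2); // drops "pear" (freq 1)
    std::printf("size=%d idx(apple)=%d\n", dict.Size(), dict.GetWordIdx("apple"));
    return 0;
}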

266
src/HuffmanEncoder.cpp Normal file

@@ -0,0 +1,266 @@
#include "HuffmanEncoder.h"
#include <algorithm>
#include <assert.h>
HuffmanEncoder::HuffmanEncoder()
{
m_dict = NULL;
}
void HuffmanEncoder::Save2File(const char* filename)
{
FILE* fid = fopen(filename, "w");
if(fid)
{
fprintf(fid, "%lld\n", m_hufflabel_info.size());
for (unsigned i = 0; i < m_hufflabel_info.size(); ++i)
{
const auto& info = m_hufflabel_info[i];
const auto& word = m_dict->GetWordInfo(i);
fprintf(fid, "%s %d", word->word.c_str(), info.codelen);
for (int j = 0; j < info.codelen; ++j)
fprintf(fid, " %d", info.code[j]);
for (int j = 0; j < info.codelen; ++j)
fprintf(fid, " %d", info.point[j]);
fprintf(fid, "\n");
}
fclose(fid);
}
else
{
printf("file open failed %s", filename);
}
}
void HuffmanEncoder::RecoverFromFile(const char* filename)
{
m_dict = new Dictionary();
FILE* fid = fopen(filename, "r");
if(fid)
{
int vocab_size;
fscanf(fid, "%lld", &vocab_size);
m_hufflabel_info.reserve(vocab_size);
m_hufflabel_info.clear();
int tmp;
char sz_label[MAX_WORD_SIZE];
for (int i = 0; i < vocab_size; ++i)
{
HuffLabelInfo info;
fscanf(fid, "%s", sz_label, MAX_WORD_SIZE);
m_dict->Insert(sz_label);
fscanf(fid, "%d", &info.codelen);
info.code.clear();
info.point.clear();
for (int j = 0; j < info.codelen; ++j)
{
fscanf(fid, "%d", &tmp);
info.code.push_back(tmp);
}
for (int j = 0; j < info.codelen; ++j)
{
fscanf(fid, "%d", &tmp);
info.point.push_back(tmp);
}
m_hufflabel_info.push_back(info);
}
fclose(fid);
}
else
{
printf("file open failed %s", filename);
}
}
bool compare(const std::pair<int, int64_t>& x, const std::pair<int, int64_t>& y)
{
// zero-frequency words sort first; the extra check keeps this a strict
// weak ordering (required by std::sort) when both counts are zero
if (x.second == 0) return y.second != 0;
if (y.second == 0) return false;
return (x.second > y.second);
}
void HuffmanEncoder::BuildHuffmanTreeFromDict()
{
std::vector<std::pair<int, int64_t> > ordered_words;
ordered_words.reserve(m_dict->Size());
ordered_words.clear();
for (unsigned i = 0; i < static_cast<unsigned>(m_dict->Size()); ++i)
ordered_words.push_back(std::pair<int, int64_t>(i, m_dict->GetWordInfo(i)->freq));
std::sort(ordered_words.begin(), ordered_words.end(), compare);
unsigned vocab_size = (unsigned) ordered_words.size();
int64_t *count = new int64_t[vocab_size * 2 + 1]; //frequency of each node
unsigned *binary = new unsigned[vocab_size * 2 + 1]; //huffman code (0 or 1) of each node relative to its parent
memset(binary, 0, sizeof(unsigned)* (vocab_size * 2 + 1));
unsigned *parent_node = new unsigned[vocab_size * 2 + 1]; //parent node index of each node
memset(parent_node, 0, sizeof(unsigned)* (vocab_size * 2 + 1));
unsigned code[MAX_CODE_LENGTH], point[MAX_CODE_LENGTH];
for (unsigned i = 0; i < vocab_size; ++i)
count[i] = ordered_words[i].second;
for (unsigned i = vocab_size; i < vocab_size * 2; i++)
count[i] = static_cast<int64_t>(1e15);
int pos1 = vocab_size - 1;
int pos2 = vocab_size;
int min1i, min2i;
for (unsigned i = 0; i < vocab_size - 1; i++)
{
// First, find two smallest nodes 'min1, min2'
assert(pos2 < vocab_size * 2 - 1);
//find the smallest node
if (pos1 >= 0)
{
if (count[pos1] < count[pos2])
{
min1i = pos1;
pos1--;
}
else
{
min1i = pos2;
pos2++;
}
}
else
{
min1i = pos2;
pos2++;
}
//find the second smallest node
if (pos1 >= 0)
{
if (count[pos1] < count[pos2])
{
min2i = pos1;
pos1--;
}
else
{
min2i = pos2;
pos2++;
}
}
else
{
min2i = pos2;
pos2++;
}
count[vocab_size + i] = count[min1i] + count[min2i];
assert(min1i >= 0 && min1i < vocab_size * 2 - 1 && min2i >= 0 && min2i < vocab_size * 2 - 1);
parent_node[min1i] = vocab_size + i;
parent_node[min2i] = vocab_size + i;
binary[min2i] = 1;
}
assert(pos1 < 0);
//generate the huffman code for each leaf node
m_hufflabel_info.clear();
for (unsigned a = 0; a < vocab_size; ++a)
m_hufflabel_info.push_back(HuffLabelInfo());
for (unsigned a = 0; a < vocab_size; a++)
{
unsigned b = a, i = 0;
while (1)
{
assert(i < MAX_CODE_LENGTH);
code[i] = binary[b];
point[i] = b;
i++;
b = parent_node[b];
if (b == vocab_size * 2 - 2) break;
}
unsigned cur_word = ordered_words[a].first;
m_hufflabel_info[cur_word].codelen = i;
m_hufflabel_info[cur_word].point.push_back(vocab_size - 2);
for (b = 0; b < i; b++)
{
m_hufflabel_info[cur_word].code.push_back(code[i - b - 1]);
if (b)
m_hufflabel_info[cur_word].point.push_back(point[i - b] - vocab_size);
}
}
delete[] count;
count = nullptr;
delete[] binary;
binary = nullptr;
delete[] parent_node;
parent_node = nullptr;
}
void HuffmanEncoder::BuildFromTermFrequency(const char* filename)
{
FILE* fid = fopen(filename, "r");
if(fid)
{
char sz_label[MAX_WORD_SIZE];
m_dict = new Dictionary();
while (fscanf(fid, "%s", sz_label, MAX_WORD_SIZE) != EOF)
{
HuffLabelInfo info;
int freq;
fscanf(fid, "%d", &freq);
m_dict->Insert(sz_label, freq);
}
fclose(fid);
BuildHuffmanTreeFromDict();
}
else
{
printf("file open failed %s", filename);
}
}
void HuffmanEncoder::BuildFromTermFrequency(Dictionary* dict)
{
m_dict = dict;
BuildHuffmanTreeFromDict();
}
int HuffmanEncoder::GetLabelSize()
{
return m_dict->Size();
}
int HuffmanEncoder::GetLabelIdx(const char* label)
{
return m_dict->GetWordIdx(label);
}
HuffLabelInfo* HuffmanEncoder::GetLabelInfo(char* label)
{
int idx = GetLabelIdx(label);
if (idx == -1)
return NULL;
return GetLabelInfo(idx);
}
HuffLabelInfo* HuffmanEncoder::GetLabelInfo(int label_idx)
{
if (label_idx == -1) return NULL;
return &m_hufflabel_info[label_idx];
}
Dictionary* HuffmanEncoder::GetDict()
{
return m_dict;
}

40
src/HuffmanEncoder.h Normal file

@@ -0,0 +1,40 @@
#pragma once
#include "Dictionary.h"
const int MAX_CODE_LENGTH = 100;
struct HuffLabelInfo
{
std::vector<int> point; //internal node ids in the code path
std::vector<char> code; //huffman code
int codelen;
HuffLabelInfo()
{
codelen = 0;
point.clear();
code.clear();
}
};
class HuffmanEncoder
{
public:
HuffmanEncoder();
void Save2File(const char* filename);
void RecoverFromFile(const char* filename);
void BuildFromTermFrequency(const char* filename);
void BuildFromTermFrequency(Dictionary* dict);
int GetLabelSize();
int GetLabelIdx(const char* label);
HuffLabelInfo* GetLabelInfo(char* label);
HuffLabelInfo* GetLabelInfo(int label_idx);
Dictionary* GetDict();
private:
void BuildHuffmanTreeFromDict();
std::vector<HuffLabelInfo> m_hufflabel_info;
Dictionary* m_dict;
};
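
A minimal sketch (not part of this commit) of building Huffman codes from a small dictionary and reading one label's code length:
#include "HuffmanEncoder.h"
#include <cstdio>

int main()
{
    Dictionary* dict = new Dictionary();
    dict->Insert("the", 100);
    dict->Insert("cat", 10);
    dict->Insert("sat", 1);
    HuffmanEncoder encoder;
    encoder.BuildFromTermFrequency(dict);  // fills code/point for every word
    HuffLabelInfo* info = encoder.GetLabelInfo(encoder.GetLabelIdx("cat"));
    std::printf("codelen=%d\n", info->codelen);
    return 0;  // dict intentionally not freed: the encoder keeps a pointer to it
}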

110
src/Log.cpp Normal file

@@ -0,0 +1,110 @@
#include "Log.h"
LogLevel Logger::level_ = LogLevel::Info;
std::FILE* Logger::file_ = nullptr;
Logger::Logger()
{
level_ = LogLevel::Info;
file_ = nullptr;
}
Logger::~Logger()
{
CloseLogFile();
}
void Logger::Reset(std::string filename, LogLevel level)
{
level_ = level;
file_ = nullptr;
ResetLogFile(filename);
}
int Logger::ResetLogFile(std::string filename)
{
// close the current log file
CloseLogFile();
// If the filename is specified, try to open it; if the filename is empty
// or opening fails, just write the messages to standard output.
if (filename.size() > 0)
{
file_ = fopen(filename.c_str(), "w");
if (file_ == nullptr) // failed to open the file
{
Printf(LogLevel::Error, "Cannot create log file %s\n",
filename.c_str());
return -1;
}
}
return 0;
}
void Logger::ResetLogLevel(LogLevel level)
{
level_ = level;
}
int Logger::Printf(LogLevel level, const char *format, ...)
{
// omit the message with low level
if (level < level_)
{
return 0;
}
std::string level_str = GetLevelStr(level);
std::string time_str = GetSystemTime();
va_list val, val_copy;
va_start(val, format);
va_copy(val_copy, val); // a va_list cannot be reused after vprintf consumes it
// write the message to standard output
printf("[%s] [%s] ", level_str.c_str(), time_str.c_str());
int ret = vprintf(format, val);
fflush(stdout);
// write the message to log file
if (file_ != nullptr)
{
fprintf(file_, "[%s] [%s] ", level_str.c_str(), time_str.c_str());
vfprintf(file_, format, val_copy);
fflush(file_);
}
va_end(val_copy);
va_end(val);
// If it is a FATAL error, kill the process
if (LogLevel::Fatal == level)
{
CloseLogFile();
exit(1);
}
return ret;
}
void Logger::CloseLogFile()
{
if (file_ != nullptr)
{
fclose(file_);
file_ = nullptr;
}
}
std::string Logger::GetSystemTime()
{
time_t t = time(0);
char str[64];
strftime(str, sizeof(str), "%Y-%m-%d %H:%M:%S", localtime(&t));
return str;
}
std::string Logger::GetLevelStr(LogLevel level)
{
switch (level)
{
case LogLevel::Debug: return "DEBUG";
case LogLevel::Info: return "INFO";
case LogLevel::Error: return "ERROR";
case LogLevel::Fatal: return "FATAL";
default: return "UNKNOW";
}
}

81
src/Log.h Normal file

@@ -0,0 +1,81 @@
#pragma once
#include <fstream>
#include <string>
#include <cstdarg>
#include <ctime>
/*!
* \brief A enumeration type of log message levels.
* \note The values are ordered: DEBUG < INFO < ERROR < FATAL.
*/
enum class LogLevel : int
{
Debug = 0,
Info = 1,
Error = 2,
Fatal = 3
};
/*!
* \brief The class Logger is responsible for writing log messages into
* standard output or log file.
*/
class Logger
{
public:
/*!
* \brief Creates an instance of class Logger.
*
* By default, the log messages will be written to standard output with
* minimal level of INFO. Users are able to further set the log file or
* log level with corresponding methods.
*/
Logger();
~Logger();
/*!
* \brief Reset the setting of the Logger by specifying log file
* and log level.
*
* The log message will be written to both standard output and file (if
* created successfully).
* \param filename Log file name
* \param level Log level
*/
static void Reset(std::string filename, LogLevel level = LogLevel::Info);
/*!
* \brief Resets the log file.
* \param filename The new log filename. If it is empty, the Logger
* will close current log file (if it exists).
*/
static int ResetLogFile(std::string filename);
/*!
* \brief Resets the log level.
* \param level The new log level.
*/
static void ResetLogLevel(LogLevel level);
/*!
* \brief C style formatted method for writing log messages. A message
* is with the following format: [LEVEL] [TIME] message
* \param level The log level of this message.
* \param format The C format string.
* \param ... Output items.
* \return Returns a nonnegative integer on success,
* or a negative number on error.
*/
static int Printf(LogLevel level, const char *format, ...);
private:
static void CloseLogFile();
// Returns current system time as a string.
static std::string GetSystemTime();
// Returns the string of a log level.
static std::string GetLevelStr(LogLevel level);
static LogLevel level_; // Only messages with level not less than level_ will be output.
static std::FILE *file_; // A file pointer to the log file.
};
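
A minimal usage sketch (not part of this commit); all Logger methods are static:
#include "Log.h"

int main()
{
    Logger::ResetLogLevel(LogLevel::Debug);
    Logger::Printf(LogLevel::Info, "processed %d sentences\n", 128);
    Logger::Printf(LogLevel::Debug, "gamma[0] = %f\n", 0.25);
    return 0;  // a Fatal message would close the log file and exit(1)
}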

124
src/Main.cpp Normal file

@@ -0,0 +1,124 @@
#include <thread>
#include <string>
#include <iostream>
#include <cstring>
#include <cmath>
#include <vector>
#include <fstream>
#include <sstream>
#include <multiverso.h>
#include <barrier.h>
#include "Dictionary.h"
#include "HuffmanEncoder.h"
#include "Util.h"
#include "Reader.h"
#include "MultiversoSkipGramMixture.h"
#include "ParamLoader.h"
#include "Trainer.h"
#include "SkipGramMixtureNeuralNetwork.h"
bool ReadWord(char *word, FILE *fin)
{
int idx = 0;
int ch; // int, so EOF (-1) is not confused with a valid character
while (!feof(fin))
{
ch = fgetc(fin);
if (ch == EOF) break;
if (ch == 13) continue; // skip carriage returns
if ((ch == ' ') || (ch == '\t') || (ch == '\n'))
{
if (idx > 0)
{
if (ch == '\n')
ungetc(ch, fin);
break;
}
if (ch == '\n')
{
strcpy(word, (char *)"</s>");
return true;
}
else
{
continue;
}
}
word[idx++] = ch;
if (idx >= MAX_STRING - 1) idx--; // Truncate too long words
}
word[idx] = 0;
return idx > 0;
}
// Read the vocabulary file; create the dictionary and huffman_encoder according to opt
int64_t LoadVocab(Option *opt, Dictionary *dictionary, HuffmanEncoder *huffman_encoder)
{
int64_t total_words = 0;
char word[MAX_STRING];
FILE* fid = nullptr;
printf("vocab_file %s\n", opt->read_vocab_file);
if (opt->read_vocab_file != nullptr && strlen(opt->read_vocab_file) > 0)
{
printf("Begin to load vocabulary file [%s] ...\n", opt->read_vocab_file);
fid = fopen(opt->read_vocab_file, "r");
int word_freq;
while (fscanf(fid, "%s %d", word, &word_freq) != EOF)
{
dictionary->Insert(word, word_freq);
}
}
dictionary->RemoveWordsLessThan(opt->min_count);
printf("Dictionary size: %d\n", dictionary->Size());
total_words = 0;
for (int i = 0; i < dictionary->Size(); ++i)
total_words += dictionary->GetWordInfo(i)->freq;
printf("Words in Corpus %I64d\n", total_words);
huffman_encoder->BuildFromTermFrequency(dictionary);
if (fid != nullptr) fclose(fid); // fid stays null when no vocab file was given
return total_words;
}
int main(int argc, char *argv[])
{
srand(static_cast<unsigned int>(time(NULL)));
Option *option = new Option();
Dictionary *dictionary = new Dictionary();
HuffmanEncoder *huffman_encoder = new HuffmanEncoder();
// Parse argument and store them in option
option->ParseArgs(argc, argv);
option->PrintArgs();
if (!option->CheckArgs())
{
printf("Fatal error in arguments\n");
return -1;
}
// Read the vocabulary file; create the dictionary and huffman_encoder according to opt
printf("Loading vocabulary ...\n");
option->total_words = LoadVocab(option, dictionary, huffman_encoder);
printf("Loaded vocabulary\n");
fflush(stdout);
Reader *reader = new Reader(dictionary, option);
MultiversoSkipGramMixture *multiverso_word2vector = new MultiversoSkipGramMixture(option, dictionary, huffman_encoder, reader);
fflush(stdout);
multiverso_word2vector->Train(argc, argv);
delete multiverso_word2vector;
delete reader;
delete huffman_encoder;
delete dictionary;
delete option;
return 0;
}
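
LoadVocab above expects a plain-text vocabulary file with one "word frequency" pair per line; a minimal sketch (not part of this commit; the file name is hypothetical) that writes such a file:
#include <cstdio>

int main()
{
    FILE* fid = std::fopen("vocab.txt", "w"); // hypothetical file name
    if (!fid) return 1;
    std::fprintf(fid, "the 120000\n");
    std::fprintf(fid, "cat 523\n");
    std::fclose(fid);
    return 0;
}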

271
src/MultiversoSkipGramMixture.cpp Normal file

@@ -0,0 +1,271 @@
#include "MultiversoSkipGramMixture.h"
#include <algorithm>
MultiversoSkipGramMixture::MultiversoSkipGramMixture(Option *option, Dictionary *dictionary, HuffmanEncoder *huffman_encoder, Reader *reader)
{
m_option = option;
m_dictionary = dictionary;
m_huffman_encoder = huffman_encoder;
m_reader = reader;
InitSenseCntInfo();
}
void MultiversoSkipGramMixture::InitSenseCntInfo()
{
//First, determine #senses for words according to configuration parameters: top_N and top_ratio
int threshold = (m_option->top_N ? std::min(m_option->top_N, m_dictionary->Size()) : m_dictionary->Size());
threshold = static_cast<int>(std::min(static_cast<real>(m_option->top_ratio) * m_dictionary->Size(), static_cast<real>(threshold)));
m_word_sense_info.total_senses_cnt = threshold * m_option->sense_num_multi + (m_dictionary->Size() - threshold);
std::pair<int, int64_t>* wordlist = new std::pair<int, int64_t>[m_dictionary->Size() + 10];
for (int i = 0; i < m_dictionary->Size(); ++i)
wordlist[i] = std::pair<int, int64_t>(i, m_dictionary->GetWordInfo(i)->freq);
std::sort(wordlist, wordlist + m_dictionary->Size(), [](std::pair<int, int64_t> a, std::pair<int, int64_t> b) {
return a.second > b.second;
});
m_word_sense_info.word_sense_cnts_info.resize(m_dictionary->Size());
for (int i = 0; i < threshold; ++i)
m_word_sense_info.word_sense_cnts_info[wordlist[i].first] = m_option->sense_num_multi;
for (int i = threshold; i < m_dictionary->Size(); ++i)
m_word_sense_info.word_sense_cnts_info[wordlist[i].first] = 1;
//Then, read words #sense info from the sense file
if (m_option->sense_file)
{
FILE* fid = fopen(m_option->sense_file, "r");
char word[1000];
while (fscanf(fid, "%s", word) != EOF)
{
int word_idx = m_dictionary->GetWordIdx(word);
if (word_idx == -1)
continue;
if (m_word_sense_info.word_sense_cnts_info[word_idx] == 1)
{
m_word_sense_info.word_sense_cnts_info[word_idx] = m_option->sense_num_multi;
m_word_sense_info.total_senses_cnt += (m_option->sense_num_multi - 1);
}
}
fclose(fid);
}
//At last, point pointers to the right position
m_word_sense_info.p_input_embedding.resize(m_dictionary->Size());
int cnt = 0;
m_word_sense_info.multi_senses_words_cnt = 0;
for (int i = 0; i < m_dictionary->Size(); ++i)
{
m_word_sense_info.p_input_embedding[i] = cnt;
if (m_word_sense_info.word_sense_cnts_info[i] > 1)
m_word_sense_info.p_wordidx2sense_idx[i] = m_word_sense_info.multi_senses_words_cnt++;
cnt += m_word_sense_info.word_sense_cnts_info[i];
}
printf("Total senses:%d, total multiple mearning words:%d\n", m_word_sense_info.total_senses_cnt, m_word_sense_info.multi_senses_words_cnt);
}
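// Worked example (hypothetical numbers): with a 100,000-word dictionary,
// top_N = 0 and top_ratio = 0.05 give threshold = 5,000; with
// sense_num_multi = 3, total_senses_cnt = 5,000 * 3 + 95,000 = 110,000,
// and p_input_embedding[w] is the offset of word w's first sense row.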
void MultiversoSkipGramMixture::Train(int argc, char *argv[])
{
multiverso::Barrier* barrier = new multiverso::Barrier(m_option->thread_cnt);
printf("Inited barrier\n");
SkipGramMixtureNeuralNetwork<real>* word2vector_neural_networks[2] = { new SkipGramMixtureNeuralNetwork<real>(m_option, m_huffman_encoder, &m_word_sense_info, m_dictionary, m_dictionary->Size()),
new SkipGramMixtureNeuralNetwork<real>(m_option, m_huffman_encoder, &m_word_sense_info, m_dictionary, m_dictionary->Size()) };
// Create Multiverso ParameterLoader and Trainers,
// start Multiverso environment
printf("Initializing Multiverso ...\n");
fflush(stdout);
std::vector<multiverso::TrainerBase*> trainers;
for (int i = 0; i < m_option->thread_cnt; ++i)
{
trainers.push_back(new Trainer<real>(i, m_option, (void**)word2vector_neural_networks, barrier, m_dictionary, &m_word_sense_info, m_huffman_encoder));
}
ParameterLoader<real> *parameter_loader = new ParameterLoader<real>(m_option, (void**)word2vector_neural_networks, &m_word_sense_info);
multiverso::Config config;
config.max_delay = m_option->max_delay;
config.num_servers = m_option->num_servers;
config.num_aggregator = m_option->num_aggregator;
config.lock_option = static_cast<multiverso::LockOption>(m_option->lock_option);
config.num_lock = m_option->num_lock;
config.is_pipeline = m_option->pipline;
fflush(stdout);
multiverso::Multiverso::Init(trainers, parameter_loader, config, &argc, &argv);
fflush(stdout);
multiverso::Log::ResetLogFile("log.txt");
m_process_id = multiverso::Multiverso::ProcessRank();
PrepareMultiversoParameterTables(m_option, m_dictionary);
printf("Start to train ...\n");
TrainNeuralNetwork();
printf("Rank %d Finish training\n", m_process_id);
delete barrier;
delete word2vector_neural_networks[0];
delete word2vector_neural_networks[1];
for (auto &trainer : trainers)
{
delete trainer;
}
delete parameter_loader;
multiverso::Multiverso::Close();
}
void MultiversoSkipGramMixture::AddMultiversoParameterTable(multiverso::integer_t table_id, multiverso::integer_t rows,
multiverso::integer_t cols, multiverso::Type type, multiverso::Format default_format)
{
multiverso::Multiverso::AddServerTable(table_id, rows, cols, type, default_format);
multiverso::Multiverso::AddCacheTable(table_id, rows, cols, type, default_format, 0);
multiverso::Multiverso::AddAggregatorTable(table_id, rows, cols, type, default_format, 0);
}
void MultiversoSkipGramMixture::PrepareMultiversoParameterTables(Option *opt, Dictionary *dictionary)
{
multiverso::Multiverso::BeginConfig();
int proc_count = multiverso::Multiverso::TotalProcessCount();
// create tables
AddMultiversoParameterTable(kInputEmbeddingTableId, m_word_sense_info.total_senses_cnt, opt->embeding_size, multiverso::Type::Float, multiverso::Format::Dense);
AddMultiversoParameterTable(kEmbeddingOutputTableId, dictionary->Size(), opt->embeding_size, multiverso::Type::Float, multiverso::Format::Dense);
AddMultiversoParameterTable(kWordCountActualTableId, 1, 1, multiverso::Type::LongLong, multiverso::Format::Dense);
AddMultiversoParameterTable(kWordSensePriorTableId, m_word_sense_info.multi_senses_words_cnt, m_option->sense_num_multi, multiverso::Type::Float, multiverso::Format::Dense);
// initialize input embeddings
for (int row = 0; row < m_word_sense_info.total_senses_cnt; ++row)
{
for (int col = 0; col < opt->embeding_size; ++col)
{
multiverso::Multiverso::AddToServer<real>(kInputEmbeddingTableId, row, col, static_cast<real>((static_cast<real>(rand()) / RAND_MAX - 0.5) / opt->embeding_size / proc_count));
}
}
//initialize sense priors
for (int row = 0; row < m_word_sense_info.multi_senses_words_cnt; ++row)
{
for (int col = 0; col < opt->sense_num_multi; ++col)
{
multiverso::Multiverso::AddToServer<real>(kWordSensePriorTableId, row, col,
static_cast<real>(m_option->store_multinomial ? 1.0 / m_option->sense_num_multi : log(1.0 / m_option->sense_num_multi)));
}
}
multiverso::Multiverso::EndConfig();
}
//Load the sentences from train file, and store them in data_block
void MultiversoSkipGramMixture::LoadData(DataBlock *data_block, Reader *reader, int64_t size)
{
data_block->ReleaseSentences();
while (data_block->Size() < m_option->data_block_size)
{
int64_t word_count = 0;
int *sentence = new (std::nothrow)int[MAX_SENTENCE_LENGTH + 2];
assert(sentence != nullptr);
int sentence_length = reader->GetSentence(sentence, word_count);
if (sentence_length > 0)
{
data_block->Add(sentence, sentence_length, word_count, (uint64_t)rand() * 10000 + (uint64_t)rand());
}
else
{
//Reader read eof
delete[] sentence;
return;
}
}
}
void MultiversoSkipGramMixture::PushDataBlock(
std::queue<DataBlock*> &datablock_queue, DataBlock* data_block)
{
multiverso::Multiverso::PushDataBlock(data_block);
datablock_queue.push(data_block);
//limit the number of in-flight datablocks to avoid running out of memory
while (static_cast<int64_t>(datablock_queue.size()) > m_option->max_preload_blocks_cnt)
{
std::chrono::milliseconds dura(200);
std::this_thread::sleep_for(dura);
RemoveDoneDataBlock(datablock_queue);
}
}
//Remove the datablocks that have been dealt with by the parameter loader and trainer
void MultiversoSkipGramMixture::RemoveDoneDataBlock(std::queue<DataBlock*> &datablock_queue)
{
while (datablock_queue.empty() == false
&& datablock_queue.front()->IsDone())
{
DataBlock *p_data_block = datablock_queue.front();
datablock_queue.pop();
delete p_data_block;
}
}
void MultiversoSkipGramMixture::TrainNeuralNetwork()
{
std::queue<DataBlock*> datablock_queue;
int data_block_count = 0;
multiverso::Multiverso::BeginTrain();
for (int curr_epoch = 0; curr_epoch < m_option->epoch; ++curr_epoch)
{
m_reader->Open(m_option->train_file);
while (1)
{
++data_block_count;
DataBlock *data_block = new (std::nothrow)DataBlock();
assert(data_block != nullptr);
clock_t start = clock();
LoadData(data_block, m_reader, m_option->data_block_size);
if (data_block->Size() <= 0)
{
delete data_block;
break;
}
multiverso::Log::Info("Rank%d Load%d^thDataBlockTime:%lfs\n", m_process_id, data_block_count,
(clock() - start) / (double)CLOCKS_PER_SEC);
multiverso::Multiverso::BeginClock();
PushDataBlock(datablock_queue, data_block);
multiverso::Multiverso::EndClock();
}
m_reader->Close();
multiverso::Multiverso::BeginClock();
DataBlock *output_data_block = new DataBlock(); //Add a special data_block for dumping model files
output_data_block->AddTable(kInputEmbeddingTableId);
output_data_block->AddTable(kEmbeddingOutputTableId);
output_data_block->AddTable(kWordSensePriorTableId);
output_data_block->SetEpochId(curr_epoch);
++data_block_count;
multiverso::Multiverso::PushDataBlock(output_data_block);
multiverso::Multiverso::EndClock();
}
multiverso::Log::Info("Rank %d pushed %d blocks\n", multiverso::Multiverso::ProcessRank(), data_block_count);
multiverso::Multiverso::EndTrain();
//After EndTrain, all the datablocks are done,
//so we remove them all
RemoveDoneDataBlock(datablock_queue);
}

78
src/MultiversoSkipGramMixture.h Normal file

@@ -0,0 +1,78 @@
#pragma once
#include <vector>
#include <ctime>
#include <stdlib.h>
#include <string.h>
#include <unordered_set>
#include <unordered_map>
#include <multiverso.h>
#include <log.h>
#include "Util.h"
#include "HuffmanEncoder.h"
#include "DataBlock.h"
#include "ParamLoader.h"
#include "Trainer.h"
#include "Reader.h"
class MultiversoSkipGramMixture
{
public:
MultiversoSkipGramMixture(Option *option, Dictionary *dictionary, HuffmanEncoder *huffman_encoder, Reader *reader);
void Train(int argc, char *argv[]);
private:
int m_process_id;
Option* m_option;
Dictionary* m_dictionary;
HuffmanEncoder* m_huffman_encoder;
Reader* m_reader;
WordSenseInfo m_word_sense_info;
/*!
* \brief Complete the train task with multiverso
*/
void TrainNeuralNetwork();
/*!
* \brief Create a new table in the multiverso
*/
void AddMultiversoParameterTable(multiverso::integer_t table_id, multiverso::integer_t rows,
multiverso::integer_t cols, multiverso::Type type, multiverso::Format default_format);
/*!
* \brief Prepare parameter table in the multiverso
*/
void PrepareMultiversoParameterTables(Option *opt, Dictionary *dictionary);
/*!
* \brief Load data from train_file into the datablock
* \param data_block the datablock to be filled
* \param reader the reader used to get sentences from the train file
* \param size the datablock size limit
*/
void LoadData(DataBlock *data_block, Reader *reader, int64_t size);
/*!
* \brief Push the datablock into the multiverso and datablock_queue
*/
void PushDataBlock(std::queue<DataBlock*> &datablock_queue, DataBlock* data_block);
/*!
* \brief Remove datablock which is finished by multiverso thread
* \param datablock_queue store the pushed datablocks
*/
void RemoveDoneDataBlock(std::queue<DataBlock*> &datablock_queue);
/*!
* \brief Init the sense count info for all words
*/
void InitSenseCntInfo();
};

10
src/MultiversoTablesId.h Normal file

@@ -0,0 +1,10 @@
#pragma once
#include <multiverso.h>
/*!
* \brief Defines the index of parameter tables.
*/
const multiverso::integer_t kInputEmbeddingTableId = 0; //Input embedding vector table
const multiverso::integer_t kEmbeddingOutputTableId = 1; //Huffman tree node embedding vector table
const multiverso::integer_t kWordCountActualTableId = 2; //Word count table
const multiverso::integer_t kWordSensePriorTableId = 3; //Sense priors table
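// Example usage from later files in this commit: ParameterLoader prefetches
// rows with RequestRow(kInputEmbeddingTableId, row), and Trainer reads the
// shared word count via GetRow<int64_t>(kWordCountActualTableId, 0).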

65
src/ParamLoader.cpp Normal file

@@ -0,0 +1,65 @@
#include "ParamLoader.h"
template<typename T>
ParameterLoader<T>::ParameterLoader(Option *option, void** word2vector_neural_networks, WordSenseInfo* word_sense_info)
{
m_option = option;
m_parse_and_request_count = 0;
m_sgmixture_neural_networks = word2vector_neural_networks;
m_log_file = fopen("parameter_loader.log", "w");
m_words_sense_info = word_sense_info;
}
template<typename T>
void ParameterLoader<T>::ParseAndRequest(multiverso::DataBlockBase *data_block)
{
if (m_parse_and_request_count == 0)
{
m_start_time = clock();
}
fprintf(m_log_file, "%lf\n", (clock() - m_start_time) / (double)CLOCKS_PER_SEC);
multiverso::Log::Info("Rank %d ParameterLoader begin %d\n", multiverso::Multiverso::ProcessRank(), m_parse_and_request_count);
DataBlock *data = reinterpret_cast<DataBlock*>(data_block);
SkipGramMixtureNeuralNetwork<T>* sg_mixture_neural_network = reinterpret_cast<SkipGramMixtureNeuralNetwork<T>*>(m_sgmixture_neural_networks[m_parse_and_request_count % 2]);
++m_parse_and_request_count;
data->UpdateNextRandom();
sg_mixture_neural_network->PrepareParmeter(data);
std::vector<int>& input_layer_nodes = sg_mixture_neural_network->GetInputLayerNodes();
std::vector<int>& output_layer_nodes = sg_mixture_neural_network->GetOutputLayerNodes();
assert(sg_mixture_neural_network->status == 0);
sg_mixture_neural_network->status = 1;
for (int i = 0; i < input_layer_nodes.size(); ++i)
{
int word_id = input_layer_nodes[i];
for (int j = 0; j < m_words_sense_info->word_sense_cnts_info[word_id]; ++j)
RequestRow(kInputEmbeddingTableId, m_words_sense_info->p_input_embedding[word_id] + j);
}
for (int i = 0; i < output_layer_nodes.size(); ++i)
RequestRow(kEmbeddingOutputTableId, output_layer_nodes[i]);
RequestRow(kWordCountActualTableId, 0);
for (int i = 0; i < input_layer_nodes.size(); ++i)
{
int word_id = input_layer_nodes[i];
if (m_words_sense_info->word_sense_cnts_info[word_id] > 1)
RequestRow(kWordSensePriorTableId, m_words_sense_info->p_wordidx2sense_idx[word_id]);
}
std::vector<int> & tables = data->GetTables();
for (int i = 0; i < tables.size(); ++i)
RequestTable(tables[i]);
multiverso::Log::Info("Rank %d ParameterLoader finish %d\n", multiverso::Multiverso::ProcessRank(), m_parse_and_request_count - 1);
fprintf(m_log_file, "%lf\n", (clock() - m_start_time) / (double)CLOCKS_PER_SEC);
assert(sg_mixture_neural_network->status == 1);
sg_mixture_neural_network->status = 2;
}
template class ParameterLoader<float>;
template class ParameterLoader<double>;

34
src/ParamLoader.h Normal file

@@ -0,0 +1,34 @@
#pragma once
#include <multiverso.h>
#include "DataBlock.h"
#include "MultiversoTablesId.h"
#include "Util.h"
#include "HuffmanEncoder.h"
#include "SkipGramMixtureNeuralNetwork.h"
#include "Log.h"
/*!
* \brief The class ParameterLoader preloads the parameters from multiverso server
*/
template<typename T>
class ParameterLoader : public multiverso::ParameterLoaderBase
{
public:
ParameterLoader(Option *opt, void ** word2vector_neural_networks, WordSenseInfo* word_sense_info);
/*!
* \brief Request the parameters from multiverso server according to data_block
* \param data_block stores the information of sentences
*/
void ParseAndRequest(multiverso::DataBlockBase* data_block) override;
private:
int m_parse_and_request_count;
Option* m_option;
clock_t m_start_time;
WordSenseInfo* m_words_sense_info;
void ** m_sgmixture_neural_networks;
FILE* m_log_file;
};
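
The loader and the trainers hand blocks to each other through two shared networks, alternating on the block counter; a minimal, self-contained sketch (not this commit's code; the actual loader steps status 0 -> 1 -> 2) of that ping-pong and the status hand-off:
#include <cassert>

struct Net { int status = 0; }; // 0: idle, 2: parameters requested

int main()
{
    Net nets[2];
    for (int block = 0; block < 4; ++block)
    {
        Net& net = nets[block % 2]; // loader and trainer pick the same buffer
        assert(net.status == 0);    // loader: parse block, request rows
        net.status = 2;
        assert(net.status == 2);    // trainer: train on the block, then recycle
        net.status = 0;
    }
    return 0;
}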

85
src/Reader.cpp Normal file

@@ -0,0 +1,85 @@
#include "Reader.h"
Reader::Reader(Dictionary *dictionary, Option *option)
{
m_dictionary = dictionary;
m_option = option;
m_stopwords_table.clear();
if (m_option->stopwords)
{
FILE* fid = fopen(m_option->sw_file, "r");
while (ReadWord(m_word, fid))
{
m_stopwords_table.insert(m_word);
if (m_dictionary->GetWordIdx(m_word) != -1)
m_option->total_words -= m_dictionary->GetWordInfo(m_word)->freq;
}
fclose(fid);
}
}
void Reader::Open(const char *input_file)
{
m_fin = fopen(input_file, "r");
}
void Reader::Close()
{
fclose(m_fin);
m_fin = nullptr;
}
int Reader::GetSentence(int *sentence, int64_t &word_count)
{
int length = 0, word_idx;
word_count = 0;
while (1)
{
if (!ReadWord(m_word, m_fin))
break;
word_idx = m_dictionary->GetWordIdx(m_word);
if (word_idx == -1)
continue;
word_count++;
if (m_option->stopwords && m_stopwords_table.count(m_word))
continue;
sentence[length++] = word_idx;
if (length >= MAX_SENTENCE_LENGTH)
break;
}
return length;
}
bool Reader::ReadWord(char *word, FILE *fin)
{
int idx = 0;
int ch; // int, so EOF (-1) is not confused with a valid character
while (!feof(fin))
{
ch = fgetc(fin);
if (ch == EOF) break;
if (ch == 13) continue; // skip carriage returns
if ((ch == ' ') || (ch == '\t') || (ch == '\n'))
{
if (idx > 0)
{
if (ch == '\n')
ungetc(ch, fin);
break;
}
if (ch == '\n')
{
strcpy(word, (char *)"</s>");
return true;
}
else continue;
}
word[idx++] = ch;
if (idx >= MAX_STRING - 1) idx--; // Truncate too long words
}
word[idx] = 0;
return idx != 0;
}

24
src/Reader.h Normal file

@@ -0,0 +1,24 @@
#pragma once
#include "Util.h"
#include "Dictionary.h"
#include <mutex>
#include <unordered_set>
class Reader
{
public:
Reader(Dictionary *dictionary, Option *option);
void Open(const char *input_file);
void Close();
int GetSentence(int *sentence, int64_t &word_count);
private:
Option* m_option;
FILE* m_fin;
char m_word[MAX_STRING + 1];
Dictionary *m_dictionary;
std::unordered_set<std::string> m_stopwords_table;
bool ReadWord(char *word, FILE *fin);
};
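
A minimal usage sketch (not part of this commit), assuming the Option and Dictionary were populated as in Main.cpp and that MAX_SENTENCE_LENGTH comes from Util.h:
#include "Reader.h"

void ReadAll(Reader* reader, const char* train_file)
{
    int sentence[MAX_SENTENCE_LENGTH + 2];
    int64_t word_count = 0;
    reader->Open(train_file);
    while (reader->GetSentence(sentence, word_count) > 0)
    {
        // sentence[] now holds dictionary word indices, stopwords filtered out
    }
    reader->Close();
}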

333
src/SkipGramMixtureNeuralNetwork.cpp Normal file

@@ -0,0 +1,333 @@
#include "SkipGramMixtureNeuralNetwork.h"
template<typename T>
SkipGramMixtureNeuralNetwork<T>::SkipGramMixtureNeuralNetwork(Option* option, HuffmanEncoder* huffmanEncoder, WordSenseInfo* word_sense_info, Dictionary* dic, int dicSize)
{
status = 0;
m_option = option;
m_huffman_encoder = huffmanEncoder;
m_word_sense_info = word_sense_info;
m_dictionary_size = dicSize;
m_dictionary = dic;
m_input_embedding_weights_ptr = new T*[m_dictionary_size];
m_sense_priors_ptr = new T*[m_dictionary_size];
m_sense_priors_paras_ptr = new T*[m_dictionary_size];
m_output_embedding_weights_ptr = new T*[m_dictionary_size];
m_seleted_input_embedding_weights = new bool[m_dictionary_size];
m_selected_output_embedding_weights = new bool[m_dictionary_size];
assert(m_input_embedding_weights_ptr != nullptr);
assert(m_output_embedding_weights_ptr != nullptr);
assert(m_seleted_input_embedding_weights != nullptr);
assert(m_selected_output_embedding_weights != nullptr);
memset(m_seleted_input_embedding_weights, 0, sizeof(bool) * m_dictionary_size);
memset(m_selected_output_embedding_weights, 0, sizeof(bool) * m_dictionary_size);
}
template<typename T>
SkipGramMixtureNeuralNetwork<T>::~SkipGramMixtureNeuralNetwork()
{
delete[] m_input_embedding_weights_ptr; // arrays allocated with new[] must be freed with delete[]
delete[] m_output_embedding_weights_ptr;
delete[] m_sense_priors_ptr;
delete[] m_sense_priors_paras_ptr;
delete[] m_seleted_input_embedding_weights;
delete[] m_selected_output_embedding_weights;
}
template<typename T>
void SkipGramMixtureNeuralNetwork<T>::Train(int* sentence, int sentence_length, T* gamma, T* fTable, T* input_backup)
{
ParseSentence(sentence, sentence_length, gamma, fTable, input_backup, &SkipGramMixtureNeuralNetwork<T>::TrainSample);
}
template<typename T>
//The E-step: estimate the posterior multinomial probabilities
T SkipGramMixtureNeuralNetwork<T>::Estimate_Gamma_m(int word_input, std::vector<std::pair<int, int> >& output_nodes, T* posterior_ll, T* estimation, T* sense_prior, T* f_m)
{
T* inputEmbedding = m_input_embedding_weights_ptr[word_input];
T f, log_likelihood = 0;
for (int sense_idx = 0; sense_idx < m_word_sense_info->word_sense_cnts_info[word_input]; ++sense_idx, inputEmbedding += m_option->embeding_size)
{
posterior_ll[sense_idx] = sense_prior[sense_idx] < eps ? MIN_LOG : log(sense_prior[sense_idx]); //posterior likelihood for each sense
int64_t fidx = sense_idx * MAX_CODE_LENGTH;
for (int d = 0; d < output_nodes.size(); ++d, fidx++)
{
f = Util::InnerProduct(inputEmbedding, m_output_embedding_weights_ptr[output_nodes[d].first], m_option->embeding_size);
f = Util::Sigmoid(f);
f_m[fidx] = f;
if (output_nodes[d].second) //huffman code, 0 or 1
f = 1 - f;
posterior_ll[sense_idx] += f < eps ? MIN_LOG : log(f);
}
log_likelihood += posterior_ll[sense_idx];
}
if (m_word_sense_info->word_sense_cnts_info[word_input] == 1)
{
estimation[0] = 1;
return log_likelihood;
}
Util::SoftMax(posterior_ll, estimation, m_word_sense_info->word_sense_cnts_info[word_input]);
return log_likelihood;
}
template<typename T>
//The M Step: update the sense prior probabilities to maximize the Q function
void SkipGramMixtureNeuralNetwork<T>::Maximize_Pi(int word_input, T* log_likelihood)
{
if (m_word_sense_info->word_sense_cnts_info[word_input] == 1)
{
return;
}
for (int sense_idx = 0; sense_idx < m_word_sense_info->word_sense_cnts_info[word_input]; ++sense_idx)
{
T new_alpha = log_likelihood[sense_idx];
m_sense_priors_paras_ptr[word_input][sense_idx] = m_sense_priors_paras_ptr[word_input][sense_idx] * sense_prior_momentum + new_alpha * (1 - sense_prior_momentum);
}
if (!m_option->store_multinomial)
Util::SoftMax(m_sense_priors_paras_ptr[word_input], m_sense_priors_ptr[word_input], m_option->sense_num_multi); //Update the multinomial parameters
}
template<typename T>
//The M step: update the embedding vectors to maximize the Q function
void SkipGramMixtureNeuralNetwork<T>::UpdateEmbeddings(int word_input, std::vector<std::pair<int, int> >& output_nodes, T* estimation, T* f_m, T* input_backup, UpdateDirection direction)
{
T g;
T* output_embedding;
T* inputEmbedding;
if (direction == UpdateDirection::UPDATE_INPUT)
inputEmbedding = m_input_embedding_weights_ptr[word_input];
else inputEmbedding = input_backup;
for (int sense_idx = 0; sense_idx < m_word_sense_info->word_sense_cnts_info[word_input]; ++sense_idx, inputEmbedding += m_option->embeding_size)
{
int64_t fidx = sense_idx * MAX_CODE_LENGTH;
for (int d = 0; d < output_nodes.size(); ++d, ++fidx)
{
output_embedding = m_output_embedding_weights_ptr[output_nodes[d].first];
g = estimation[sense_idx] * (1 - output_nodes[d].second - f_m[fidx]) * learning_rate;
if (direction == UpdateDirection::UPDATE_INPUT) //Update Input
{
for (int j = 0; j < m_option->embeding_size; ++j)
inputEmbedding[j] += g * output_embedding[j];
}
else // Update Output
{
for (int j = 0; j < m_option->embeding_size; ++j)
output_embedding[j] += g * inputEmbedding[j];
}
}
}
}
template<typename T>
//Train a window sample and update the input embedding & output embedding vectors
void SkipGramMixtureNeuralNetwork<T>::TrainSample(int input_node, std::vector<std::pair<int, int> >& output_nodes, void* v_gamma, void* v_fTable, void* v_input_backup)
{
T* gamma = (T*)v_gamma; //stores the posterior probabilities
T* fTable = (T*)v_fTable; //stores the inner product values of input and output embeddings
T* input_backup = (T*)v_input_backup;
T posterior_ll[MAX_SENSE_CNT]; //stores the posterior log likelihood
T senses[1] = { 1.0 }; //For those words with only one sense
T* sense_prior = m_word_sense_info->word_sense_cnts_info[input_node] == 1 ? senses : (m_option->store_multinomial ? m_sense_priors_paras_ptr[input_node] : m_sense_priors_ptr[input_node]);
T log_likelihood;
for (int iter = 0; iter < m_option->EM_iteration; ++iter)
{
// backup input embeddings
memcpy(input_backup, m_input_embedding_weights_ptr[input_node], m_option->embeding_size * m_word_sense_info->word_sense_cnts_info[input_node] * sizeof(T));
log_likelihood = 0;
// E-Step
log_likelihood += Estimate_Gamma_m(input_node, output_nodes, posterior_ll, gamma, sense_prior, fTable);
// M-Step
if (m_option->store_multinomial)
Maximize_Pi(input_node, gamma);
else
Maximize_Pi(input_node, posterior_ll);
UpdateEmbeddings(input_node, output_nodes, gamma, fTable, input_backup, UpdateDirection::UPDATE_INPUT);
UpdateEmbeddings(input_node, output_nodes, gamma, fTable, input_backup, UpdateDirection::UPDATE_OUTPUT);
}
}
template<typename T>
//Collect all the input words and output nodes in the data block
void SkipGramMixtureNeuralNetwork<T>::PrepareParmeter(DataBlock* data_block)
{
for (int i = 0; i < m_input_layer_nodes.size(); ++i)
{
m_input_embedding_weights_ptr[m_input_layer_nodes[i]] = nullptr;
m_seleted_input_embedding_weights[m_input_layer_nodes[i]] = false;
}
for (int i = 0; i < m_output_layer_nodes.size(); ++i)
{
m_output_embedding_weights_ptr[m_output_layer_nodes[i]] = nullptr;
m_selected_output_embedding_weights[m_output_layer_nodes[i]] = false;
}
m_input_layer_nodes.clear();
m_output_layer_nodes.clear();
int sentence_length;
int64_t word_count_delta;
int* sentence;
uint64_t next_random;
for (int i = 0; i < data_block->Size(); ++i)
{
data_block->Get(i, sentence, sentence_length, word_count_delta, next_random);
ParseSentence(sentence, sentence_length, nullptr, nullptr, nullptr, &SkipGramMixtureNeuralNetwork<T>::DealPrepareParameter);
}
}
template<typename T>
//Copy the input_nodes&output_nodes to private set
void SkipGramMixtureNeuralNetwork<T>::DealPrepareParameter(int input_node, std::vector<std::pair<int, int> >& output_nodes, void* v_gamma, void* v_fTable, void* v_input_backup)
{
AddInputLayerNode(input_node);
for (int i = 0; i < output_nodes.size(); ++i)
AddOutputLayerNode(output_nodes[i].first);
}
template<typename T>
/*
Parse a sentence and branch into one of two paths:
one for TrainNN, the other for parameter parse & request
*/
void SkipGramMixtureNeuralNetwork<T>::ParseSentence(int* sentence, int sentence_length, T* gamma, T* fTable, T* input_backup, FunctionType function)
{
if (sentence_length == 0)
return;
int feat[MAX_SENTENCE_LENGTH + 10];
int input_node;
std::vector<std::pair<int, int> > output_nodes;
for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
{
if (sentence[sentence_position] == -1) continue;
int feat_size = 0;
for (int i = 0; i < m_option->window_size * 2 + 1; ++i)
if (i != m_option->window_size)
{
int c = sentence_position - m_option->window_size + i;
if (c < 0 || c >= sentence_length || sentence[c] == -1) continue;
feat[feat_size++] = sentence[c];
//Begin: Train SkipGram
{
input_node = feat[feat_size - 1];
output_nodes.clear();
Parse(input_node, sentence[sentence_position], output_nodes);
(this->*function)(input_node, output_nodes, gamma, fTable, input_backup);
}
}
}
}
template<typename T>
//Parse the needed parameter in a window
void SkipGramMixtureNeuralNetwork<T>::Parse(int feat, int out_word_idx, std::vector<std::pair<int, int> >& output_nodes)
{
const auto info = m_huffman_encoder->GetLabelInfo(out_word_idx);
for (int d = 0; d < info->codelen; d++)
output_nodes.push_back(std::make_pair(info->point[d], info->code[d]));
}
template<typename T>
void SkipGramMixtureNeuralNetwork<T>::AddInputLayerNode(int node_id)
{
if (m_seleted_input_embedding_weights[node_id] == false)
{
m_seleted_input_embedding_weights[node_id] = true;
m_input_layer_nodes.push_back(node_id);
}
}
template<typename T>
void SkipGramMixtureNeuralNetwork<T>::AddOutputLayerNode(int node_id)
{
if (m_selected_output_embedding_weights[node_id] == false)
{
m_selected_output_embedding_weights[node_id] = true;
m_output_layer_nodes.push_back(node_id);
}
}
template<typename T>
std::vector<int>& SkipGramMixtureNeuralNetwork<T>::GetInputLayerNodes()
{
return m_input_layer_nodes;
}
template<typename T>
std::vector<int>& SkipGramMixtureNeuralNetwork<T>::GetOutputLayerNodes()
{
return m_output_layer_nodes;
}
template<typename T>
void SkipGramMixtureNeuralNetwork<T>::SetInputEmbeddingWeights(int input_node_id, T* ptr)
{
m_input_embedding_weights_ptr[input_node_id] = ptr;
}
template<typename T>
void SkipGramMixtureNeuralNetwork<T>::SetOutputEmbeddingWeights(int output_node_id, T* ptr)
{
m_output_embedding_weights_ptr[output_node_id] = ptr;
}
template <typename T>
void SkipGramMixtureNeuralNetwork<T>::SetSensePriorWeights(int input_node_id, T*ptr)
{
m_sense_priors_ptr[input_node_id] = ptr;
}
template <typename T>
void SkipGramMixtureNeuralNetwork<T>::SetSensePriorParaWeights(int input_node_id, T* ptr)
{
m_sense_priors_paras_ptr[input_node_id] = ptr;
}
template<typename T>
T* SkipGramMixtureNeuralNetwork<T>::GetInputEmbeddingWeights(int input_node_id)
{
return m_input_embedding_weights_ptr[input_node_id];
}
template<typename T>
T* SkipGramMixtureNeuralNetwork<T>::GetEmbeddingOutputWeights(int output_node_id)
{
return m_output_embedding_weights_ptr[output_node_id];
}
template<typename T>
T* SkipGramMixtureNeuralNetwork<T>::GetSensePriorWeights(int input_node_id)
{
return m_sense_priors_ptr[input_node_id];
}
template<typename T>
T* SkipGramMixtureNeuralNetwork<T>::GetSensePriorParaWeights(int input_node_id)
{
return m_sense_priors_paras_ptr[input_node_id];
}
template class SkipGramMixtureNeuralNetwork<float>;
template class SkipGramMixtureNeuralNetwork<double>;

140
src/SkipGramMixtureNeuralNetwork.h Normal file

@@ -0,0 +1,140 @@
#pragma once
#include <vector>
#include "Util.h"
#include <multiverso.h>
#include "HuffmanEncoder.h"
#include "MultiversoSkipGramMixture.h"
#include "cstring"
enum class UpdateDirection
{
UPDATE_INPUT,
UPDATE_OUTPUT
};
template<typename T>
class SkipGramMixtureNeuralNetwork
{
public:
T learning_rate;
T sense_prior_momentum;
int status;
SkipGramMixtureNeuralNetwork(Option* option, HuffmanEncoder* huffmanEncoder, WordSenseInfo* word_sense_info, Dictionary* dic, int dicSize);
~SkipGramMixtureNeuralNetwork();
void Train(int* sentence, int sentence_length, T* gamma, T* fTable, T* input_backup);
/*!
* \brief Collect all the input words and output nodes in the data block
*/
void PrepareParmeter(DataBlock *data_block);
std::vector<int>& GetInputLayerNodes();
std::vector<int>& GetOutputLayerNodes();
/*!
* \brief Set the pointers to those local parameters
*/
void SetInputEmbeddingWeights(int input_node_id, T* ptr);
void SetOutputEmbeddingWeights(int output_node_id, T* ptr);
void SetSensePriorWeights(int input_node_id, T*ptr);
void SetSensePriorParaWeights(int input_node_id, T* ptr);
/*!
* \brief Get the pointers to those locally updated parameters
*/
T* GetInputEmbeddingWeights(int input_node_id);
T* GetEmbeddingOutputWeights(int output_node_id);
T* GetSensePriorWeights(int input_node_id);
T* GetSensePriorParaWeights(int input_node_id);
private:
Option *m_option;
Dictionary *m_dictionary;
HuffmanEncoder *m_huffman_encoder;
int m_dictionary_size;
WordSenseInfo* m_word_sense_info;
T** m_input_embedding_weights_ptr; //Points to every word's input embedding vector
bool *m_seleted_input_embedding_weights;
T** m_output_embedding_weights_ptr; //Points to every huffman node's embedding vector
bool *m_selected_output_embedding_weights;
T** m_sense_priors_ptr; //Points to the multinomial parameters, if store_multinomial is set to zero.
T** m_sense_priors_paras_ptr;//Points to sense prior parameters. If store_multinomial is zero, then it points to the log of multinomial, otherwise points to the multinomial parameters
std::vector<int> m_input_layer_nodes;
std::vector<int> m_output_layer_nodes;
typedef void(SkipGramMixtureNeuralNetwork<T>::*FunctionType)(int input_node, std::vector<std::pair<int, int> >& output_nodes, void* v_gamma, void* v_fTable, void* v_input_backup);
/*!
* \brief Parse the needed parameter in a window
*/
void Parse(int feat, int word_idx, std::vector<std::pair<int, int> >& output_nodes);
/*!
* \brief Parse a sentence and branch into one of two paths:
* one for TrainNN, the other for parameter parse & request
*/
void ParseSentence(int* sentence, int sentence_length, T* gamma, T* fTable, T* input_backup, FunctionType function);
/*!
* \brief Copy the input_nodes&output_nodes to WordEmbedding private set
*/
void DealPrepareParameter(int input_nodes, std::vector<std::pair<int, int> >& output_nodes, void* v_gamma, void* v_fTable, void* v_input_backup);
/*!
* \brief Train a window sample and update the
* input-embedding & output-embedding vectors
* \param word_input represents the input word
* \param output_nodes represent the output nodes on the huffman tree, including the node index and path label
* \param v_gamma is the temp memory to store the posterior probabilities of each sense
* \param v_fTable is the temp memory to store the sigmoid value of inner product of input and output embeddings
* \param v_input_backup stores the input embedding vectors as backup
*/
void TrainSample(int word_input, std::vector<std::pair<int, int> >& output_nodes, void* v_gamma, void* v_fTable, void* v_input_backup);
/*!
* \brief The E-step, estimate the posterior multinomial probabilities
* \param word_input represents the input word
* \param output_nodes represent the output nodes on the huffman tree, including the node index and path label
* \param posterior represents the calculated posterior log likelihood
* \param estimation represents the calculated gammas (see the paper), that is, the softmax terms of posterior
* \param sense_prior represents the parameters of sense prior probabilities for each polysemous word
* \param f_m is the temp memory to store the sigmoid value of inner products of input and output embeddings
*/
T Estimate_Gamma_m(int word_input, std::vector<std::pair<int, int> >& output_nodes, T* posterior, T* estimation, T* sense_prior, T* f_m);
/*!
* \brief The M step: update the embedding vectors to maximize the Q function
* \param word_input represents the input word
* \param output_nodes represent the output nodes on the huffman tree, including the node index and path label
* \param estimation represents the calculated gammas (see the paper), that is, the softmax terms of posterior
* \param f_m is the temp memory to store the sigmoid value of inner products of input and output embeddings
* \param input_backup stores the input embedding vectors as backup
* \param direction: update input vectors or output vectors
*/
void UpdateEmbeddings(int word_input, std::vector<std::pair<int, int> >& output_nodes, T* estimation, T* f_m, T* input_backup, UpdateDirection direction);
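/*
* A gradient sketch for this M-step, again assuming the usual word2vec
* hierarchical-softmax convention (the .cpp definition is authoritative):
*   for each sense m and each (node, label) in output_nodes:
*     g = learning_rate * estimation[m] * (1 - label - sigma(<input_m, output_node>))
*     direction == input : input_m += g * output_node
*     direction == output: output_node += g * backup_of_input_m
* The backup is used on the output side so that both updates are computed
* against the same pre-update input vectors.
*/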
/*!
* \brief The M Step: update the sense prior probabilities to maximize the Q function
* \param word_input the input word
* \param curr_priors the closed-form values of the sense priors in this iteration
*/
void Maximize_Pi(int word_input, T* curr_priors);
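/*
* The new priors are blended with the old ones using the momentum
* convention documented in Trainer::AddParameterRowToMultiverso:
*   pi_new[m] = momentum * pi_old[m] + (1 - momentum) * curr_priors[m]
*/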
/*!
* \brief Record the input word so that the parameter loader can request it
*/
void AddInputLayerNode(int node_id);
/*!
* \brief Record the Huffman tree node so that the parameter loader can request it
*/
void AddOutputLayerNode(int node_id);
};

445
src/Trainer.cpp Normal file

@ -0,0 +1,445 @@
#include "Trainer.h"
template<typename T>
Trainer<T>::Trainer(int trainer_id, Option *option, void** word2vector_neural_networks, multiverso::Barrier *barrier, Dictionary* dictionary, WordSenseInfo* word_sense_info, HuffmanEncoder* huff_encoder)
{
m_trainer_id = trainer_id;
m_option = option;
m_word_count = m_last_word_count = 0;
m_sgmixture_neural_networks = word2vector_neural_networks;
m_barrier = barrier;
m_dictionary = dictionary;
m_word_sense_info = word_sense_info;
m_huffman_encoder = huff_encoder;
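//Scratch buffers reused by the EM steps across sentences: gamma holds the
//per-(window position, sense) posteriors, fTable the sigmoid values along the
//Huffman paths, and input_backup a pre-update copy of the input embeddings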
gamma = (T*)calloc(m_option->window_size * MAX_SENSE_CNT, sizeof(T));
fTable = (T*)calloc(m_option->window_size * MAX_CODE_LENGTH * MAX_SENSE_CNT, sizeof(T));
input_backup = (T*)calloc(m_option->embeding_size * MAX_SENSE_CNT, sizeof(T));
m_start_time = 0;
m_train_count = 0;
m_executive_time = 0;
if (m_trainer_id == 0)
{
m_log_file = fopen("trainer.log", "w");
}
}
//Train one data block
template<typename T>
void Trainer<T>::TrainIteration(multiverso::DataBlockBase *data_block)
{
if (m_train_count == 0)
{
m_start_time = clock();
m_process_id = multiverso::Multiverso::ProcessRank();
}
printf("Rank %d Begin TrainIteration...%d\n", m_process_id, m_train_count);
clock_t train_interation_start = clock();
fflush(stdout);
m_process_count = multiverso::Multiverso::TotalProcessCount();
DataBlock *data = reinterpret_cast<DataBlock*>(data_block);
SkipGramMixtureNeuralNetwork<T>* word2vector_neural_network = reinterpret_cast<SkipGramMixtureNeuralNetwork<T>*>(m_sgmixture_neural_networks[m_train_count % 2]);
++m_train_count;
std::vector<int>& input_layer_nodes = word2vector_neural_network->GetInputLayerNodes();
std::vector<int>& output_layer_nodes = word2vector_neural_network->GetOutputLayerNodes();
std::vector<int> local_input_layer_nodes, local_output_layer_nodes;
assert(word2vector_neural_network->status == 2);
if (m_trainer_id == 0)
{
multiverso::Log::Info("Rank %d input_layer_size=%d, output_layer_size=%d\n", m_process_id, input_layer_nodes.size(), output_layer_nodes.size());
}
for (int i = m_trainer_id; i < input_layer_nodes.size(); i += m_option->thread_cnt)
{
local_input_layer_nodes.push_back(input_layer_nodes[i]);
}
for (int i = m_trainer_id; i < output_layer_nodes.size(); i += m_option->thread_cnt)
{
local_output_layer_nodes.push_back(output_layer_nodes[i]);
}
CopyParameterFromMultiverso(local_input_layer_nodes, local_output_layer_nodes, word2vector_neural_network);
multiverso::Row<int64_t>& word_count_actual_row = GetRow<int64_t>(kWordCountActualTableId, 0);
T learning_rate = m_option->init_learning_rate * (1 - word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1));
if (learning_rate < m_option->init_learning_rate * (real)0.0001)
learning_rate = m_option->init_learning_rate * (real)0.0001;
word2vector_neural_network->learning_rate = learning_rate;
//Linearly increase the momentum from init_sense_prior_momentum to 1
word2vector_neural_network->sense_prior_momentum = m_option->init_sense_prior_momentum +
(1 - m_option->init_sense_prior_momentum) * word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1);
m_barrier->Wait();
for (int i = m_trainer_id; i < data->Size(); i += m_option->thread_cnt) //i iterates over all sentences
{
int sentence_length;
int64_t word_count_delta;
int *sentence;
uint64_t next_random;
data->Get(i, sentence, sentence_length, word_count_delta, next_random);
word2vector_neural_network->Train(sentence, sentence_length, gamma, fTable, input_backup);
m_word_count += word_count_delta;
if (m_word_count - m_last_word_count > 10000)
{
multiverso::Row<int64_t>& word_count_actual_row = GetRow<int64_t>(kWordCountActualTableId, 0);
Add<int64_t>(kWordCountActualTableId, 0, 0, m_word_count - m_last_word_count);
m_last_word_count = m_word_count;
m_now_time = clock();
if (m_trainer_id % 3 == 0)
{
multiverso::Log::Info("Rank %d Trainer %d lr: %.5f Mom: %.4f Progress: %.2f%% Words/thread/sec(total): %.2fk W/t/sec(executive): %.2fk\n",
m_process_id, m_trainer_id,
word2vector_neural_network->learning_rate, word2vector_neural_network->sense_prior_momentum,
word_count_actual_row.At(0) / (real)(m_option->total_words * m_option->epoch + 1) * 100,
m_last_word_count / ((real)(m_now_time - m_start_time + 1) / (real)CLOCKS_PER_SEC * 1000),
m_last_word_count / ((real)(m_executive_time + clock() - train_interation_start + 1) / (real)CLOCKS_PER_SEC * 1000));
fflush(stdout);
}
T learning_rate = m_option->init_learning_rate * (1 - word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1));
if (learning_rate < m_option->init_learning_rate * (real)0.0001)
learning_rate = m_option->init_learning_rate * (real)0.0001;
word2vector_neural_network->learning_rate = learning_rate;
word2vector_neural_network->sense_prior_momentum = m_option->init_sense_prior_momentum + (1 - m_option->init_sense_prior_momentum) * word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1);
}
}
m_barrier->Wait();
AddParameterToMultiverso(local_input_layer_nodes, local_output_layer_nodes, word2vector_neural_network);
m_executive_time += clock() - train_interation_start;
multiverso::Log::Info("Rank %d Train %d end at %lfs, cost %lfs, total cost %lfs\n",
m_process_id,
m_trainer_id, clock() / (double)CLOCKS_PER_SEC,
(clock() - train_interation_start) / (double)CLOCKS_PER_SEC,
m_executive_time / (double)CLOCKS_PER_SEC);
fflush(stdout);
if (data->GetTables().size() > 0 && m_trainer_id == 0) //Dump model files
{
SaveMultiInputEmbedding(data->GetEpochId());
SaveOutputEmbedding(data->GetEpochId());
if (data->GetEpochId() == 0)
SaveHuffEncoder();
fprintf(m_log_file, "%d %lf\t %lf\n", data->GetEpochId(), (clock() - m_start_time) / (double)CLOCKS_PER_SEC, m_executive_time / (double)CLOCKS_PER_SEC);
}
assert(word2vector_neural_network->status == 2);
word2vector_neural_network->status = 0;
multiverso::Log::Info("Rank %d Train %d are leaving training iter with nn status:%d\n", m_process_id, m_trainer_id, word2vector_neural_network->status);
fflush(stdout);
}
//Copy 'size' elements from a source row into a local dest buffer
template<typename T>
void Trainer<T>::CopyMemory(T* dest, multiverso::Row<T>& source, int size)
{
for (int i = 0; i < size; ++i)
dest[i] = source.At(i);
}
//Copy the needed parameters from the Multiverso buffer into local blocks
template<typename T>
int Trainer<T>::CopyParameterFromMultiverso(std::vector<int>& input_layer_nodes, std::vector<int>& output_layer_nodes, void* local_word2vector_neural_network)
{
SkipGramMixtureNeuralNetwork<T>* word2vector_neural_network = (SkipGramMixtureNeuralNetwork<T>*)local_word2vector_neural_network;
//Copy input embedding
for (int i = 0; i < input_layer_nodes.size(); ++i)
{
T* ptr = (T*)calloc(m_word_sense_info->word_sense_cnts_info[input_layer_nodes[i]] * m_option->embeding_size, sizeof(T));
int row_id_base = m_word_sense_info->p_input_embedding[input_layer_nodes[i]];
for (int j = 0, row_id = row_id_base; j < m_word_sense_info->word_sense_cnts_info[input_layer_nodes[i]]; ++j, ++row_id)
CopyMemory(ptr + j * m_option->embeding_size, GetRow<T>(kInputEmbeddingTableId, row_id), m_option->embeding_size);
word2vector_neural_network->SetInputEmbeddingWeights(input_layer_nodes[i], ptr);
}
//Copy output embedding
for (int i = 0; i < output_layer_nodes.size(); ++i)
{
T* ptr = (T*)calloc(m_option->embeding_size, sizeof(T));
CopyMemory(ptr, GetRow<T>(kEmbeddingOutputTableId, output_layer_nodes[i]), m_option->embeding_size);
for (int j = 0; j < m_option->embeding_size; j += 5)
if (!Util::ValidF(static_cast<real>(ptr[j])))
{
printf("invalid number\n");
fflush(stdout);
throw std::runtime_error("Invalid output embeddings");
}
word2vector_neural_network->SetOutputEmbeddingWeights(output_layer_nodes[i], ptr);
}
//Copy sense prior
for (int i = 0; i < input_layer_nodes.size(); ++i)
{
if (m_word_sense_info->word_sense_cnts_info[input_layer_nodes[i]] > 1)
{
T* ptr = (T*)calloc(m_option->sense_num_multi, sizeof(T));
T* para_ptr = (T*)calloc(m_option->sense_num_multi, sizeof(T));
CopyMemory(para_ptr, GetRow<T>(kWordSensePriorTableId, m_word_sense_info->p_wordidx2sense_idx[input_layer_nodes[i]]), m_option->sense_num_multi);
if (!m_option->store_multinomial)//softmax the para_ptr to obtain the multinomial parameters
Util::SoftMax(para_ptr, ptr, m_option->sense_num_multi);
word2vector_neural_network->SetSensePriorWeights(input_layer_nodes[i], ptr);
word2vector_neural_network->SetSensePriorParaWeights(input_layer_nodes[i], para_ptr);
}
}
return 0;
}
//Add the delta of one row of local parameters to the buffered parameter and send it to Multiverso
template<typename T>
void Trainer<T>::AddParameterRowToMultiverso(T* ptr, int table_id, int row_id, int size, real momentum)
{
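//Worked example (hypothetical numbers): with momentum = 0, row.At(i) = 0.5,
//ptr[i] = 0.9 and m_process_count = 4, we get dest = 0.9 and
//delta = (0.9 - 0.5) / 4 = 0.1, i.e. each of the 4 processes contributes a
//quarter of its local change to the shared parameter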
multiverso::Row<T>& row = GetRow<T>(table_id, row_id);
for (int i = 0; i < size; ++i)
{
T dest = ptr[i] * (1 - momentum) + row.At(i) * momentum;
T delta = (dest - row.At(i)) / m_process_count;
Add<T>(table_id, row_id, i, delta);
}
}
//Add the deltas to the parameters stored in the buffer and send them to Multiverso
template<typename T>
int Trainer<T>::AddParameterToMultiverso(std::vector<int>& input_layer_nodes, std::vector<int>& output_layer_nodes, void* local_word2vector_neural_network)
{
SkipGramMixtureNeuralNetwork<T>* word2vector_neural_network = (SkipGramMixtureNeuralNetwork<T>*)local_word2vector_neural_network;
std::vector<T*> blocks; //stores the locally allocated buffers so they can be freed afterwards
//Add input embeddings
for (int i = 0; i < input_layer_nodes.size(); ++i)
{
int table_id = kInputEmbeddingTableId;
int row_id_base = m_word_sense_info->p_input_embedding[input_layer_nodes[i]];
T* ptr = word2vector_neural_network->GetInputEmbeddingWeights(input_layer_nodes[i]);
for (int j = 0, row_id = row_id_base; j < m_word_sense_info->word_sense_cnts_info[input_layer_nodes[i]]; ++j, ++row_id)
AddParameterRowToMultiverso(ptr + m_option->embeding_size * j, table_id, row_id, m_option->embeding_size);
blocks.push_back(ptr);
}
//Add output embeddings
for (int i = 0; i < output_layer_nodes.size(); ++i)
{
int table_id = kEmbeddingOutputTableId;
int row_id = output_layer_nodes[i];
T* ptr = word2vector_neural_network->GetEmbeddingOutputWeights(row_id);
AddParameterRowToMultiverso(ptr, table_id, row_id, m_option->embeding_size);
blocks.push_back(ptr);
}
//Add sense priors
for (int i = 0; i < input_layer_nodes.size(); ++i)
{
if (m_word_sense_info->word_sense_cnts_info[input_layer_nodes[i]] > 1)
{
int table_id = kWordSensePriorTableId;
int row_id = m_word_sense_info->p_wordidx2sense_idx[input_layer_nodes[i]];
T* ptr = word2vector_neural_network->GetSensePriorWeights(input_layer_nodes[i]);
T* para_ptr = word2vector_neural_network->GetSensePriorParaWeights(input_layer_nodes[i]);
AddParameterRowToMultiverso(para_ptr, table_id, row_id, m_option->sense_num_multi, static_cast<real>(word2vector_neural_network->sense_prior_momentum));
blocks.push_back(ptr);
blocks.push_back(para_ptr);
}
}
for (auto& x : blocks)
free(x);
return 0;
}
template<typename T>
void Trainer<T>::SaveMultiInputEmbedding(const int epoch_id)
{
FILE* fid = nullptr;
T* sense_priors_ptr = (T*)calloc(m_option->sense_num_multi, sizeof(T)); //sizeof(T), not sizeof(real): the buffer is filled via CopyMemory<T>
char outfile[2000];
if (m_option->output_binary)
{
sprintf(outfile, "%s%d", m_option->binary_embedding_file, epoch_id);
fid = fopen(outfile, "wb");
fprintf(fid, "%d %d %d\n", m_dictionary->Size(), m_word_sense_info->total_senses_cnt, m_option->embeding_size);
for (int i = 0; i < m_dictionary->Size(); ++i)
{
fprintf(fid, "%s %d ", m_dictionary->GetWordInfo(i)->word.c_str(), m_word_sense_info->word_sense_cnts_info[i]);
int emb_row_id;
real emb_tmp;
if (m_word_sense_info->word_sense_cnts_info[i] > 1)
{
CopyMemory(sense_priors_ptr, GetRow<T>(kWordSensePriorTableId, m_word_sense_info->p_wordidx2sense_idx[i]), m_option->sense_num_multi);
if (!m_option->store_multinomial)
Util::SoftMax(sense_priors_ptr, sense_priors_ptr, m_option->sense_num_multi);
for (int j = 0; j < m_option->sense_num_multi; ++j)
{
real prior_tmp = static_cast<real>(sense_priors_ptr[j]); //write the prior as 'real' so the record layout matches the embedding entries
fwrite(&prior_tmp, sizeof(real), 1, fid);
emb_row_id = m_word_sense_info->p_input_embedding[i] + j;
multiverso::Row<real>& embedding = GetRow<real>(kInputEmbeddingTableId, emb_row_id);
for (int k = 0; k < m_option->embeding_size; ++k)
{
emb_tmp = embedding.At(k);
fwrite(&emb_tmp, sizeof(real), 1, fid);
}
}
fprintf(fid, "\n");
}
else
{
real prob = 1.0;
fwrite(&prob, sizeof(real), 1, fid);
emb_row_id = m_word_sense_info->p_input_embedding[i];
multiverso::Row<real>& embedding = GetRow<real>(kInputEmbeddingTableId, emb_row_id);
for (int k = 0; k < m_option->embeding_size; ++k)
{
emb_tmp = embedding.At(k);
fwrite(&emb_tmp, sizeof(real), 1, fid);
}
fprintf(fid, "\n");
}
}
fclose(fid);
}
if (m_option->output_binary % 2 == 0)
{
sprintf(outfile, "%s%d", m_option->text_embedding_file, epoch_id);
fid = fopen(outfile, "w");
fprintf(fid, "%d %d %d\n", m_dictionary->Size(), m_word_sense_info->total_senses_cnt, m_option->embeding_size);
for (int i = 0; i < m_dictionary->Size(); ++i)
{
fprintf(fid, "%s %d\n", m_dictionary->GetWordInfo(i)->word.c_str(), m_word_sense_info->word_sense_cnts_info[i]);
int emb_row_id;
real emb_tmp;
if (m_word_sense_info->word_sense_cnts_info[i] > 1)
{
CopyMemory(sense_priors_ptr, GetRow<T>(kWordSensePriorTableId, m_word_sense_info->p_wordidx2sense_idx[i]), m_option->sense_num_multi);
if (!m_option->store_multinomial)
Util::SoftMax(sense_priors_ptr, sense_priors_ptr, m_option->sense_num_multi);
for (int j = 0; j < m_option->sense_num_multi; ++j)
{
fprintf(fid, "%.4f", sense_priors_ptr[j]);
emb_row_id = m_word_sense_info->p_input_embedding[i] + j;
multiverso::Row<real>& embedding = GetRow<real>(kInputEmbeddingTableId, emb_row_id);
for (int k = 0; k < m_option->embeding_size; ++k)
{
emb_tmp = embedding.At(k);
fprintf(fid, " %.3f", emb_tmp);
}
fprintf(fid, "\n");
}
}
else
{
real prob = 1.0;
fprintf(fid, "%.4f", prob);
emb_row_id = m_word_sense_info->p_input_embedding[i];
multiverso::Row<real>& embedding = GetRow<real>(kInputEmbeddingTableId, emb_row_id);
for (int k = 0; k < m_option->embeding_size; ++k)
{
emb_tmp = embedding.At(k);
fprintf(fid, " %.3f", emb_tmp);
}
fprintf(fid, "\n");
}
}
fclose(fid);
}
free(sense_priors_ptr);
}
template<typename T>
void Trainer<T>::SaveOutputEmbedding(const int epoch_id)
{
char outfile[2000];
if (m_option->output_binary)
{
sprintf(outfile, "%s%d", m_option->outputlayer_binary_file, epoch_id);
FILE* fid = fopen(outfile, "wb");
fprintf(fid, "%d %d\n", m_dictionary->Size(), m_option->embeding_size);
for (int i = 0; i < m_dictionary->Size(); ++i)
{
multiverso::Row<real>& hs_embedding = GetRow<real>(kEmbeddingOutputTableId, i);
for (int j = 0; j < m_option->embeding_size; ++j)
{
real emb_tmp = hs_embedding.At(j);
fwrite(&emb_tmp, sizeof(real), 1, fid);
}
}
fclose(fid);
}
if (m_option->output_binary % 2 == 0)
{
sprintf(outfile, "%s%d", m_option->outputlayer_text_file, epoch_id);
FILE* fid = fopen(outfile, "w");
fprintf(fid, "%d %d\n", m_dictionary->Size(), m_option->embeding_size);
for (int i = 0; i < m_dictionary->Size(); ++i)
{
multiverso::Row<real>& hs_embedding = GetRow<real>(kEmbeddingOutputTableId, i);
for (int j = 0; j < m_option->embeding_size; ++j)
fprintf(fid, "%.2f ", hs_embedding.At(j));
fprintf(fid, "\n");
}
fclose(fid);
}
}
template<typename T>
void Trainer<T>::SaveHuffEncoder()
{
FILE* fid = fopen(m_option->huff_tree_file, "w");
fprintf(fid, "%d\n", m_dictionary->Size());
for (int i = 0; i < m_dictionary->Size(); ++i)
{
fprintf(fid, "%s", m_dictionary->GetWordInfo(i)->word.c_str());
const auto info = m_huffman_encoder->GetLabelInfo(i);
fprintf(fid, " %d", info->codelen);
for (int j = 0; j < info->codelen; ++j)
fprintf(fid, " %d", info->code[j]);
for (int j = 0; j < info->codelen; ++j)
fprintf(fid, " %d", info->point[j]);
fprintf(fid, "\n");
}
fclose(fid);
}
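//Explicit instantiations: the member definitions of Trainer<T> live in this
//.cpp file, so the float and double variants are instantiated here once for
//the whole program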
template class Trainer<float>;
template class Trainer<double>;

83
src/Trainer.h Normal file

@ -0,0 +1,83 @@
#pragma once
#include <thread>
#include <chrono>
#include <multiverso.h>
#include <log.h>
#include <barrier.h>
#include "DataBlock.h"
#include "MultiversoTablesId.h"
#include "Util.h"
#include "HuffmanEncoder.h"
#include "SkipGramMixtureNeuralNetwork.h"
template<typename T>
class Trainer : public multiverso::TrainerBase
{
public:
Trainer(int trainer_id, Option *option, void** word2vector_neural_networks, multiverso::Barrier* barrier, Dictionary* dictionary, WordSenseInfo* word_sense_info, HuffmanEncoder* huff_encoder);
/*!
* \brief Train one data block
*/
void TrainIteration(multiverso::DataBlockBase* data_block) override;
private:
int m_process_id;
int m_trainer_id;
int m_train_count; //number of TrainIteration calls processed so far
int m_process_count; //number of processes (machines)
Option *m_option;
WordSenseInfo* m_word_sense_info;
HuffmanEncoder* m_huffman_encoder;
int64_t m_word_count, m_last_word_count;
T *gamma, *fTable, *input_backup; //temp memories to store middle results in the EM algorithm
clock_t m_start_time, m_now_time, m_executive_time;
void ** m_sgmixture_neural_networks;
multiverso::Barrier *m_barrier;
Dictionary* m_dictionary;
FILE* m_log_file;
/*!
* \brief Save the multi sense input-embedding vectors
* \param epoch_id the epoch whose embedding vectors are dumped
*/
void SaveMultiInputEmbedding(const int epoch_id);
/*!
* \brief Save the output embedding vectors, i.e. the embeddings for the Huffman tree nodes
* \param epoch_id the epoch whose embedding vectors are dumped
*/
void SaveOutputEmbedding(const int epoch_id);
/*!
* \brief Save the Huffman tree structure
*/
void SaveHuffEncoder();
/*!
* \brief Copy the needed parameter from buffer to local blocks
*/
void CopyMemory(T* dest, multiverso::Row<T>& source, int size);
int CopyParameterFromMultiverso(std::vector<int>& input_layer_nodes, std::vector<int>& output_layer_nodes, void* word2vector_neural_networks);
/*!
* \brief Add the deltas to the parameters stored in the buffer and send them to Multiverso
*/
int AddParameterToMultiverso(std::vector<int>& input_layer_nodes, std::vector<int>& output_layer_nodes, void* word2vector_neural_networks);
/*!
* \brief Add the delta of one row of local parameters to the buffered parameter and send it to Multiverso
* \param momentum new_value = old_value * momentum + current_value * (1 - momentum); set to a non-zero value when updating the sense priors
*/
void AddParameterRowToMultiverso(T* ptr, int table_id, int row_id, int size, real momentum = 0);
};

177
src/Util.cpp Normal file

@ -0,0 +1,177 @@
#include "Util.h"
Option::Option()
{
train_file = NULL;
read_vocab_file = NULL;
binary_embedding_file = NULL;
text_embedding_file = NULL;
sw_file = NULL;
output_binary = 2;
embeding_size = 0;
thread_cnt = 1;
window_size = 5;
min_count = 5;
data_block_size = 100;
init_learning_rate = static_cast<real>(0.025);
epoch = 1;
stopwords = false;
total_words = 0;
//multisense config
store_multinomial = false;
EM_iteration = 1;
top_N = 0;
top_ratio = static_cast<real>(0.1);
sense_num_multi = 1;
init_sense_prior_momentum = static_cast<real>(0.1);
sense_file = NULL;
huff_tree_file = NULL;
outputlayer_binary_file = NULL;
outputlayer_text_file = NULL;
// multiverso config
num_servers = 0;
num_aggregator = 1;
lock_option = 1;
num_lock = 100;
max_delay = 0;
//conservative defaults (assumed): no preloaded blocks, pipelining disabled;
//these two members were otherwise left uninitialized
max_preload_blocks_cnt = 0;
pipline = false;
}
void Option::ParseArgs(int argc, char* argv[])
{
for (int i = 1; i < argc; i += 2)
{
if (strcmp(argv[i], "-size") == 0) embeding_size = atoi(argv[i + 1]);
if (strcmp(argv[i], "-train_file") == 0) train_file = argv[i + 1];
if (strcmp(argv[i], "-vocab_file") == 0) read_vocab_file = argv[i + 1];
if (strcmp(argv[i], "-binary") == 0) output_binary = atoi(argv[i + 1]);
if (strcmp(argv[i], "-init_learning_rate") == 0) init_learning_rate = static_cast<real>(atof(argv[i + 1]));
if (strcmp(argv[i], "-binary_embedding_file") == 0) binary_embedding_file = argv[i + 1];
if (strcmp(argv[i], "-text_embedding_file") == 0) text_embedding_file = argv[i + 1];
if (strcmp(argv[i], "-window") == 0) window_size = atoi(argv[i + 1]);
if (strcmp(argv[i], "-data_block_size") == 0) data_block_size = atoi(argv[i + 1]);
if (strcmp(argv[i], "-threads") == 0) thread_cnt = atoi(argv[i + 1]);
if (strcmp(argv[i], "-min_count") == 0) min_count = atoi(argv[i + 1]);
if (strcmp(argv[i], "-epoch") == 0) epoch = atoi(argv[i + 1]);
if (strcmp(argv[i], "-stopwords") == 0) stopwords = atoi(argv[i + 1]) != 0;
if (strcmp(argv[i], "-sw_file") == 0) sw_file = argv[i + 1];
if (strcmp(argv[i], "-num_servers") == 0) num_servers = atoi(argv[i + 1]);
if (strcmp(argv[i], "-num_aggregator") == 0) num_aggregator = atoi(argv[i + 1]);
if (strcmp(argv[i], "-lock_option") == 0) lock_option = atoi(argv[i + 1]);
if (strcmp(argv[i], "-num_lock") == 0) num_lock = atoi(argv[i + 1]);
if (strcmp(argv[i], "-max_delay") == 0) max_delay = atoi(argv[i + 1]);
if (strcmp(argv[i], "-max_preload_size") == 0) max_preload_blocks_cnt = atoi(argv[i + 1]);
if (strcmp(argv[i], "-is_pipline") == 0) pipline = atoi(argv[i + 1]) != 0;
if (strcmp(argv[i], "-sense_num_multi") == 0) sense_num_multi = atoi(argv[i + 1]);
if (strcmp(argv[i], "-momentum") == 0) init_sense_prior_momentum = static_cast<real>(atof(argv[i + 1]));
if (strcmp(argv[i], "-EM_iteration") == 0) EM_iteration = atoi(argv[i + 1]);
if (strcmp(argv[i], "-store_multinomial") == 0) store_multinomial = atoi(argv[i + 1]) != 0;
if (strcmp(argv[i], "-top_n") == 0) top_N = atoi(argv[i + 1]);
if (strcmp(argv[i], "-top_ratio") == 0) top_ratio = static_cast<real>(atof(argv[i + 1]));
if (strcmp(argv[i], "-read_sense") == 0) sense_file = argv[i + 1];
if (strcmp(argv[i], "-huff_tree_file") == 0) huff_tree_file = argv[i + 1];
if (strcmp(argv[i], "-outputlayer_binary_file") == 0) outputlayer_binary_file = argv[i + 1];
if (strcmp(argv[i], "-outputlayer_text_file") == 0) outputlayer_text_file = argv[i + 1];
}
}
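//Example invocation (hypothetical file names, shown for illustration only;
//-binary 1 selects binary-only output, so no text file names are required):
// ./bin/multisense_word_embedding -train_file corpus.txt -vocab_file vocab.txt
//   -size 50 -threads 4 -window 5 -binary 1 -sense_num_multi 3 -top_n 500
//   -huff_tree_file huff.txt -binary_embedding_file emb.bin
//   -outputlayer_binary_file nodes.bin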
void Option::PrintArgs()
{
printf("train_file: %s\n", train_file);
printf("read_vocab_file: %s\n", read_vocab_file);
printf("binary_embedding_file: %s\n", binary_embedding_file);
printf("sw_file: %s\n", sw_file);
printf("output_binary: %d\n", output_binary);
printf("stopwords: %d\n", stopwords);
printf("embeding_size: %d\n", embeding_size);
printf("thread_cnt: %d\n", thread_cnt);
printf("window_size: %d\n", window_size);
printf("min_count: %d\n", min_count);
printf("epoch: %d\n", epoch);
printf("total_words: %lld\n", total_words);
printf("init_learning_rate: %lf\n", init_learning_rate);
printf("data_block_size: %d\n", data_block_size);
printf("pre_load_data_blocks: %d\n", max_preload_blocks_cnt);
printf("num_servers: %d\n", num_servers);
printf("num_aggregator: %d\n", num_aggregator);
printf("lock_option: %d\n", lock_option);
printf("num_lock: %d\n", num_lock);
printf("max_delay: %d\n", max_delay);
printf("is_pipline:%d\n", pipline);
printf("top_ratio: %lf\n", top_ratio);
printf("top_N: %d\n", top_N);
printf("store_multinomial: %d\n", store_multinomial);
}
//Check whether the user defined arguments are valid
bool Option::CheckArgs()
{
if (!Util::IsFileExist(train_file))
{
printf("Train corpus does not exist\n");
return false;
}
if (!Util::IsFileExist(read_vocab_file))
{
printf("Vocab file does not exist\n");
return false;
}
if (output_binary && (binary_embedding_file == NULL || outputlayer_binary_file == NULL))
{
printf("Binary output file name not specified\n");
return false;
}
if (output_binary % 2 == 0 && (text_embedding_file == NULL || outputlayer_text_file == NULL))
{
printf("Text output file name not specified\n");
return false;
}
if (huff_tree_file == NULL)
{
printf("Huffman tree file name not speficied\n");
return false;
}
if (stopwords && !Util::IsFileExist(sw_file))
{
printf("Stop words file does not exist\n");
return false;
}
if (init_sense_prior_momentum < -eps || init_sense_prior_momentum >= 1)
{
printf("Init momentum %.4f out of range, must lie between 0.0 and 1.0\n", init_sense_prior_momentum);
return false;
}
if (top_ratio < -eps || top_ratio >= 1)
{
printf("Top ratio %.4f out of range, must lie between 0.0 and 1.0\n", init_sense_prior_momentum);
return false;
}
if (sense_num_multi > MAX_SENSE_CNT)
{
printf("Sense number is too big, the maximum value is 50\n");
return false;
}
if (max_delay != 0)
{
printf("Warning: setting max_delay to 0 is recommended!\n");
}
return true;
}
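//ValidF returns false only for NaN: a NaN fails both comparisons below,
//while any other value (including the infinities) satisfies one of them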
bool Util::ValidF(const real &f)
{
return f < 1 || f >= 1;
}

114
src/Util.h Normal file

@ -0,0 +1,114 @@
#pragma once
#include <fstream>
#include <cstdlib>
#include <cstring>
#include <random>
#include <cassert>
#include <exception>
#include <algorithm>
#include <unordered_map>
#include <cstdint>
typedef float real;
#define MAX_STRING 100
#define MAX_SENTENCE_LENGTH 2000
#define MAX_EXP 6
#define MAX_SENSE_CNT 50
#define MIN_LOG -15
const int table_size = (int)1e8;
const real eps = (real)1e-8;
struct WordSenseInfo
{
std::vector<int> p_input_embedding; //Maps a word's index to its first row in the table kInputEmbeddingTableId
std::unordered_map<int, int> p_wordidx2sense_idx; //Maps a word's index to its row in the table kWordSensePriorTableId
std::vector<int> word_sense_cnts_info; //Records every word's sense count
int total_senses_cnt;
int multi_senses_words_cnt; //Total number of words with multiple senses
};
struct Option
{
const char* train_file;
const char* read_vocab_file;
const char* binary_embedding_file;
const char* text_embedding_file;
const char* sw_file;
int output_binary, stopwords;
int data_block_size;
int embeding_size, thread_cnt, window_size, min_count, epoch;
int64_t total_words;
real init_learning_rate;
int num_servers, num_aggregator, lock_option, num_lock, max_delay;
bool pipline;
int64_t max_preload_blocks_cnt;
/*Multi sense config*/
int EM_iteration;
int top_N; //The top_N most frequent words have multiple senses, e.g. 500, 1000, ...
real top_ratio; //The top (top_ratio * vocabulary size) most frequent words have multiple senses, e.g. 0.05, 0.1, ...
int sense_num_multi; //Default number of senses for the multi_sense words
real init_sense_prior_momentum; //Initial momentum, momentum is used in updating the sense priors
bool store_multinomial; //Use multinomial parameters. If set to false, use the log of multinomial instead
const char* sense_file; //The sense file storing (word, #sense) mapping
const char* huff_tree_file; // The output file storing the huffman tree structure
const char* outputlayer_binary_file; //The output binary file storing all the output embeddings (i.e. the Huffman node embeddings)
const char* outputlayer_text_file; //The output text file storing all the output embeddings (i.e. the Huffman node embeddings)
Option();
void ParseArgs(int argc, char* argv[]);
void PrintArgs();
bool CheckArgs();
};
class Util
{
public:
static void SaveVocab();
template<typename T>
static T InnerProduct(T* x, T* y, int length)
{
T result = 0;
for (int i = 0; i < length; ++i)
result += x[i] * y[i];
return result;
}
static bool ValidF(const real &f);
template <typename T>
static T Sigmoid(T f)
{
if (f < -MAX_EXP)
return 0;
if (f > MAX_EXP)
return 1;
return 1 / (1 + exp(-f));
}
template <typename T>
static void SoftMax(T* s, T* result, int size)
{
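//shift by the maximum before exponentiating so that exp() cannot overflow;
//the shift cancels out in the normalization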
T sum = 0, max_v = s[0];
for (int j = 1; j < size; ++j)
max_v = std::max(max_v, s[j]);
for (int j = 0; j < size; ++j)
sum += exp(s[j] - max_v);
for (int j = 0; j < size; ++j)
result[j] = exp(s[j] - max_v) / sum;
}
static bool IsFileExist(const char *fileName)
{
std::ifstream infile(fileName);
return infile.good();
}
};