(a) Fully templatized the CTC implementation

(b) Added numerical gradient support
(c) Lots of debug messages [controlled by #ifdef]
(d) Bug fix in the CtcError calculation: log was being applied twice to probabilities, causing underflows.
Hari Parthasarathi 2015-08-10 19:37:00 -07:00
Parent 04e5cdd823
Commit 53ad7080cd
2 changed files: 292 additions and 55 deletions

View file

@@ -14,49 +14,45 @@ namespace Microsoft { namespace MSR { namespace CNTK {
kCopyData
} MatrixResizeType;
struct NumericLimits
{
static const float log_zero_ = -1e100;
static const float exp_limit_ = 709.78271289338397;
static const float log_inf_ = 1e100;
static const float max_ = 1.7976931348623157e+308;
};
// a + b, where a and b are assumed to be in the log scale
float AddAB(float a, float b)
template<class ElemType>
ElemType CtcTrainingIO<ElemType>::AddAB(ElemType a, ElemType b)
{
if (a == NumericLimits::log_zero_ || b == NumericLimits::log_zero_)
return NumericLimits::log_zero_;
if (a == log_zero_ || b == log_zero_)
return log_zero_;
else
return a + b;
}
// a - b, where a and b are assumed to be in the log scale
float SubAB(float a, float b)
template<class ElemType>
ElemType CtcTrainingIO<ElemType>::SubAB(ElemType a, ElemType b)
{
if (a == NumericLimits::log_zero_)
return NumericLimits::log_zero_;
else if (b == NumericLimits::log_zero_)
return NumericLimits::log_inf_;
if (a == log_zero_)
return log_zero_;
else if (b == log_zero_)
return log_inf_;
else
return a - b;
}
// exp(a)
float ExpA(float a)
template<class ElemType>
ElemType CtcTrainingIO<ElemType>::ExpA(ElemType a)
{
if (a <= NumericLimits::log_zero_)
if (a <= log_zero_)
return 0;
else if (a >= NumericLimits::exp_limit_)
return NumericLimits::max_;
else if (a >= exp_limit_)
return max_;
else
return exp(a);
}
// Approximation of log(a + b) = log(a) + log(1 + b/a), if b < a
// = log(b) + log(1 + a/b), if a < b
float LogAPlusB(float a, float b) // x and y are in log scale and so is the result
template<class ElemType>
ElemType CtcTrainingIO<ElemType>::LogAPlusB(ElemType a, ElemType b) // x and y are in log scale and so is the result
{
if (b < a)
return AddAB(a, log(1 + ExpA(SubAB(b, a))));
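Together these four helpers implement standard log-domain arithmetic: probabilities are stored as logs so that long products of small numbers cannot underflow, and LogAPlusB is the usual log-sum-exp identity log(a + b) = max + log(1 + exp(min - max)). A minimal standalone sketch of the same trick (LogAdd and kLogZero are our names, not part of this code):

#include <algorithm>
#include <cmath>
#include <cstdio>

const double kLogZero = -1e30;  // stand-in for log_zero_: the "log of 0" sentinel

// log(a + b) given logA = log(a) and logB = log(b), without leaving log space.
double LogAdd(double logA, double logB) {
    if (logA == kLogZero) return logB;
    if (logB == kLogZero) return logA;
    double hi = std::max(logA, logB);
    double lo = std::min(logA, logB);
    // exp(lo - hi) <= 1, so the sum can never overflow.
    return hi + std::log1p(std::exp(lo - hi));
}

int main() {
    // log(0.25 + 0.25) should equal log(0.5) ~= -0.693147.
    std::printf("%f\n", LogAdd(std::log(0.25), std::log(0.25)));
}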
@@ -110,13 +106,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (i < 2)
alpha(row,i) = prob(row, idxProb);
else
alpha(row,i) = NumericLimits::log_zero_;
alpha(row,i) = log_zero_;
} else {
if (i > 1) {
if (i % 2 == 0 || labels[i-2] == labels[i]) {
alpha(row,i) = AddAB(prob(row, idxProb), LogAPlusB(alpha(row-1, i-1), alpha(row-1, i)));
} else {
float tmp = LogAPlusB(alpha(row-1, i-1), alpha(row-1, i));
ElemType tmp = LogAPlusB(alpha(row-1, i-1), alpha(row-1, i));
alpha(row,i) = AddAB(prob(row, idxProb), LogAPlusB(alpha(row-1, i-2), tmp));
}
} else if (i == 1) {
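For reference, this hunk is the standard CTC forward recursion over the blank-expanded label sequence l' (Graves et al., 2006). In probability space it reads

    alpha(t, s) = y_t(l'_s) * ( alpha(t-1, s) + alpha(t-1, s-1) + alpha(t-1, s-2) )

where the third term is dropped whenever l'_s is a blank (even s) or l'_s equals l'_{s-2}; that is exactly the i % 2 == 0 || labels[i-2] == labels[i] test above. The code evaluates the sums with LogAPlusB and the products with AddAB so everything stays in the log domain.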
@@ -146,13 +142,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (i > dim - 3)
beta(row,i) = prob(row,idxProb);
else
beta(row,i) = NumericLimits::log_zero_;
beta(row,i) = log_zero_;
} else {
if (i < dim - 2) {
if (i % 2 == 0 || labels[i+2] == labels[i]) {
beta(row,i) = AddAB(prob(row,idxProb), LogAPlusB(beta(row+1,i+1), beta(row+1,i)));
} else {
float tmp = LogAPlusB(beta(row+1,i+1), beta(row+1,i));
ElemType tmp = LogAPlusB(beta(row+1,i+1), beta(row+1,i));
beta(row,i) = AddAB(prob(row,idxProb), LogAPlusB(beta(row+1,i+2), tmp));
}
} else if (i == dim - 2) {
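The backward recursion in this hunk is the mirror image, initialized at the final frame and swept toward t = 0:

    beta(t, s) = y_t(l'_s) * ( beta(t+1, s) + beta(t+1, s+1) + beta(t+1, s+2) )

with the skip term dropped under the same blank/repeated-label condition, here tested on labels[i+2].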
@@ -171,27 +167,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> &beta,
Matrix<ElemType> &log_nnet_out,
std::vector<size_t> &labels,
float pzx) {
ElemType pzx) {
int dim_error_rows = ctc_err.GetNumRows();
int dim_error_cols = ctc_err.GetNumCols();
int label_size = labels.size();
for(int i=0; i<dim_error_rows; i++) {
for(int j=0; j<dim_error_cols; j++) {
float err = NumericLimits::log_zero_;
ElemType err = log_zero_;
for(int s = 0; s < label_size; s++) {
if (labels[s] == j) { //
if (labels[s] == j) {
err = LogAPlusB(err, AddAB(alpha(i,s), beta(i,s)));
}
}
float val = ExpA(SubAB(err, AddAB(pzx, ExpA(log_nnet_out(i,j)) == 0? NumericLimits::log_zero_ : 2*ExpA(log_nnet_out(i,j)))));
ElemType val = ExpA(SubAB(err, AddAB(pzx, ExpA(log_nnet_out(i,j)) == 0? log_zero_ : 2*log_nnet_out(i,j))));
ctc_err(i,j) = -1.0 * val;
}
}
}
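This function is where fix (d) from the commit message lands. With alpha and beta as computed above (each absorbing the frame posteriors, following Graves et al., 2006), the gradient of the log-likelihood with respect to the softmax output y_t(k) is

    d log p(z|x) / d y_t(k) = ( 1 / ( p(z|x) * y_t(k)^2 ) ) * sum over { s : l'_s = k } of alpha(t, s) * beta(t, s)

and ctc_err stores its negation. In the log domain the denominator contributes pzx + 2*log y_t(k), which is what the new 2*log_nnet_out(i,j) supplies; the old 2*ExpA(log_nnet_out(i,j)) exponentiated first, feeding the linear-scale 2*y_t(k) into AddAB where a log-scale value was expected.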
#define DEBUG_UTTERANCE (0)
template<class ElemType>
bool CtcTrainingIO<ElemType>::ComputeDerivative(const wstring& uttID,
bool CtcTrainingIO<ElemType>::ComputeDerivativeActual(const wstring& uttID,
const Matrix<ElemType>& logLikelihoodIn,
Matrix<ElemType>* derivative,
ElemType* objective)
@@ -204,7 +202,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> nnet_out(CPUDEVICE); // posterior matrix
nnet_out.Resize(log_nnet_out.GetNumRows(), log_nnet_out.GetNumCols());
for(int i =0;i<log_nnet_out.GetNumRows();i++) {
float row_sum=0;
ElemType row_sum=0;
for(int j=0; j<log_nnet_out.GetNumCols();j++) {
nnet_out(i,j) = ExpA(log_nnet_out(i,j));
row_sum = row_sum + nnet_out(i,j);
@@ -218,6 +216,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
#if(DEBUG_UTTERANCE)
FILE * pFile=0;
pFile = fopen ("debug/posterior.mat.txt","w");
if (pFile!=NULL)
{
for(int i =0;i<nnet_out.GetNumRows();i++) {
for(int j=0; j<nnet_out.GetNumCols();j++) {
fprintf(pFile, "%f ", nnet_out(i,j));
}
fprintf(pFile, "\n");
}
fclose (pFile);
}
#endif
std::string uttIDStr = msra::asr::toStr(uttID);
size_t num_frames = log_nnet_out.GetNumRows();
@@ -258,51 +271,248 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (int t = (num_frames - 1); t >= 0; t--) {
ComputeCtcLatticeBackward(beta, log_nnet_out, t, label_expand);
}
#if(DEBUG_UTTERANCE)
pFile = fopen ("debug/alpha.mat.txt","w");
if (pFile!=NULL)
{
for(int i =0;i<alpha.GetNumRows();i++) {
for(int j=0; j<alpha.GetNumCols();j++) {
fprintf(pFile, "%f ", alpha(i,j));
}
fprintf(pFile, "\n");
}
fclose (pFile);
}
pFile = fopen ("debug/beta.mat.txt","w");
if (pFile!=NULL)
{
for(int i =0;i<beta.GetNumRows();i++) {
for(int j=0; j<beta.GetNumCols();j++) {
fprintf(pFile, "%f ", beta(i,j));
}
fprintf(pFile, "\n");
}
fclose (pFile);
}
#endif
// compute the log-likelihood of the label sequence given the inputs logP(z|x)
float tmp1 = alpha(num_frames-1, exp_len_labels-1);
float tmp2 = alpha(num_frames-1, exp_len_labels-2);
float pzx = tmp1 + log(1 + ExpA(tmp2 - tmp1));
ElemType tmp1 = alpha(num_frames-1, exp_len_labels-1);
ElemType tmp2 = alpha(num_frames-1, exp_len_labels-2);
ElemType pzx = tmp1 + log(1 + ExpA(tmp2 - tmp1));
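// A valid CTC alignment must end in either the last label or the trailing
// blank, so logP(z|x) is the log-add of alpha(T-1, S-1) and alpha(T-1, S-2);
// the lines above inline that log-add instead of calling LogAPlusB.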
// compute the errors
Matrix<ElemType> ctc_err(CPUDEVICE); // error matrix
ctc_err.Resize(num_frames, num_classes);
ComputeCtcError(ctc_err, alpha, beta, log_nnet_out, label_expand, pzx);
#if(DEBUG_UTTERANCE)
printf("\nPzx=%f\n",pzx);
pFile = fopen ("debug/ctc_error.mat.txt","w");
if (pFile!=NULL)
{
for(int i =0;i<ctc_err.GetNumRows();i++) {
for(int j=0; j<ctc_err.GetNumCols();j++) {
fprintf(pFile, "%f ", ctc_err(i,j));
}
fprintf(pFile, "\n");
}
fclose (pFile);
}
#endif
// back-propagate the errors through the softmax layer
/*
Matrix<ElemType> nnet_out(log_nnet_out); // posterior matrix
for(int i =0;i<log_nnet_out.GetNumRows();i++) {
for(int j=0; j<log_nnet_out.GetNumCols();j++) {
nnet_out(i,j) = ExpA(log_nnet_out(i,j));
std::vector<ElemType> row_sum;
row_sum.resize(num_frames, 0);
for(int i =0;i<ctc_err.GetNumRows();i++) {
for(int j=0; j<ctc_err.GetNumCols();j++) {
ctc_err(i,j) = ctc_err(i,j) * nnet_out(i,j);
row_sum[i] = row_sum[i] + ctc_err(i,j);
}
}
*/
ctc_err.ElementMultiplyWith(nnet_out);
Matrix<ElemType> row_sum(CPUDEVICE);
row_sum.Resize(1, num_frames);
Matrix<ElemType>::VectorSum(ctc_err, row_sum, false);
Matrix<ElemType> net_out_tmp(nnet_out);
net_out_tmp.ColumnElementMultiplyWith(row_sum);
for(int i =0;i<net_out_tmp.GetNumRows();i++) {
ElemType scale = row_sum[i];
for(int j=0; j<net_out_tmp.GetNumCols();j++) {
net_out_tmp(i,j) = net_out_tmp(i,j) * scale;
}
}
Matrix<ElemType> diff(ctc_err);
diff = net_out_tmp - diff;
//TODO: this is not the correct posterior format
//Set the correct posterior format
diff = diff - net_out_tmp;
*derivative = diff.Transpose();
#if(DEBUG_UTTERANCE)
pFile = fopen ("debug/gradient.mat.txt","w");
if (pFile!=NULL)
{
for(int i =0;i<diff.GetNumRows();i++) {
for(int j=0; j<diff.GetNumCols();j++) {
fprintf(pFile, "%f ", diff(i,j));
}
fprintf(pFile, "\n");
}
fclose (pFile);
}
#endif
//Set the objective
*objective = logLikelihoodIn.GetNumCols() - pzx;
*objective = pzx;
assert(derivative->GetNumCols() == logLikelihoodIn.GetNumCols());
m_currentUttID = uttID;
return true;
}
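The tail of ComputeDerivativeActual back-propagates ctc_err through the softmax. With e = dL/dy and y the softmax output, the standard identity is dL/da_j = y_j*e_j - y_j * sum_k e_k*y_k: elementwise err times y, minus y scaled by the per-frame row sums of that product, which is what ElementMultiplyWith, VectorSum, and ColumnElementMultiplyWith assemble above. A self-contained numerical illustration of the identity (all names ours):

#include <cmath>
#include <cstdio>
#include <vector>

// Check the softmax backward pass used above:
// grad_a[j] = y[j]*e[j] - y[j] * sum_k e[k]*y[k].
int main() {
    std::vector<double> a = {0.3, -1.2, 0.7};   // pre-softmax activations
    std::vector<double> e = {0.5, -0.1, 0.25};  // dL/dy (arbitrary errors)
    std::vector<double> y(a.size());
    double z = 0;
    for (double v : a) z += std::exp(v);
    for (size_t j = 0; j < a.size(); j++) y[j] = std::exp(a[j]) / z;

    double rowSum = 0;                           // sum_k e[k]*y[k]
    for (size_t k = 0; k < y.size(); k++) rowSum += e[k] * y[k];

    for (size_t j = 0; j < y.size(); j++) {
        double grad = y[j] * e[j] - y[j] * rowSum;
        std::printf("dL/da[%zu] = %f\n", j, grad);
    }
}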
template<class ElemType>
bool CtcTrainingIO<ElemType>::ComputeDerivativeNumerical(const wstring& uttID,
const Matrix<ElemType>& logLikelihoodIn,
Matrix<ElemType>* derivative,
ElemType* objective)
{
ElemType eps = 0.00001;
Matrix<ElemType> diff(CPUDEVICE);
diff.Resize(logLikelihoodIn.GetNumCols(), logLikelihoodIn.GetNumRows());
diff.SetValue(kSetZero);
for(int m=0;m<logLikelihoodIn.GetNumCols();m++) {
for(int n=0; n<logLikelihoodIn.GetNumRows();n++) {
ElemType gradElt=0;
for(int dir=0; dir<2; dir++) {
//transpose the matrix so that it is in kaldi format
Matrix<ElemType> log_nnet_out(logLikelihoodIn.Transpose());
if (log_nnet_out.GetDeviceId() >= 0)
log_nnet_out.TransferFromDeviceToDevice(log_nnet_out.GetDeviceId(), CPUDEVICE, true, false, false);
log_nnet_out(m,n) = log_nnet_out(m,n) + ((dir*2) - 1) * eps;
Matrix<ElemType> nnet_out(CPUDEVICE); // posterior matrix
nnet_out.Resize(log_nnet_out.GetNumRows(), log_nnet_out.GetNumCols());
for(int i =0;i<log_nnet_out.GetNumRows();i++) {
ElemType row_sum=0;
for(int j=0; j<log_nnet_out.GetNumCols();j++) {
nnet_out(i,j) = ExpA(log_nnet_out(i,j));
row_sum = row_sum + nnet_out(i,j);
}
for(int j=0; j<log_nnet_out.GetNumCols();j++) {
nnet_out(i,j) = nnet_out(i,j)/row_sum;
}
for(int j=0; j<log_nnet_out.GetNumCols();j++) {
assert(nnet_out(i,j) >= 0.0);
log_nnet_out(i,j) = log(nnet_out(i,j));
}
}
std::string uttIDStr = msra::asr::toStr(uttID);
size_t num_frames = log_nnet_out.GetNumRows();
size_t num_classes = log_nnet_out.GetNumCols();
// Check if the label sequence for an utterance is available.
// and if so read it
if (!m_labRspecifier->HasKey(uttIDStr))
RuntimeError("Label not found for utterance %s\n", uttIDStr.c_str());
const std::vector<int32> label = m_labRspecifier->Value(uttIDStr);
// label expansion by inserting blank (indexed by 0) at the beginning and end,
// and between every pair of labels
size_t len_labels = label.size();
size_t exp_len_labels = 2*len_labels + 1;
// this code fills up the label vector with 0
// Nonspeech phones are assumed to be >= 1
std::vector<size_t> label_expand; // the label vector as a matrix
label_expand.resize(0);
label_expand.resize(exp_len_labels, 0);
for (int l = 0; l < len_labels; l++) {
label_expand[2*l+1] = label[l];
}
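// Example: with blank indexed 0, the label sequence (a, b) expands to
// (0, a, 0, b, 0): exp_len_labels = 2*len_labels + 1 = 5, and every
// even position holds a blank.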
//define matrices for the forward backward computation
Matrix<ElemType> alpha(CPUDEVICE); // alpha matrix
alpha.Resize(num_frames, exp_len_labels);
alpha.SetValue(kSetZero);
for (size_t t = 0; t < num_frames; t++) {
ComputeCtcLatticeForward(alpha, log_nnet_out, t, label_expand);
}
// compute the log-likelihood of the label sequence given the inputs logP(z|x)
ElemType tmp1 = alpha(num_frames-1, exp_len_labels-1);
ElemType tmp2 = alpha(num_frames-1, exp_len_labels-2);
ElemType pzx = tmp1 + log(1 + ExpA(tmp2 - tmp1));
gradElt = gradElt + pzx * ((dir*2) - 1);
m_currentUttID = uttID;
//Set the objective
*objective = pzx;
}
diff(m,n) = gradElt/(2*eps);
}
}
*derivative = diff.Transpose();
assert(derivative->GetNumCols() == logLikelihoodIn.GetNumCols());
return true;
}
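ComputeDerivativeNumerical is a central-difference gradient check: each log-likelihood entry is perturbed by +/- eps, P(z|x) is recomputed through the full forward pass, and the ((dir*2) - 1) factor turns the dir in {0, 1} loop into the signs of (f(x+eps) - f(x-eps)) / (2*eps). The same pattern in isolation (function names are ours):

#include <cmath>
#include <cstdio>
#include <functional>

// Central-difference approximation of df/dx, the scheme used above.
double NumericalGradient(const std::function<double(double)>& f,
                         double x, double eps = 1e-5) {
    double grad = 0;
    for (int dir = 0; dir < 2; dir++) {
        double sign = (dir * 2) - 1;       // -1 then +1, as in the loop above
        grad += sign * f(x + sign * eps);  // accumulates f(x+eps) - f(x-eps)
    }
    return grad / (2 * eps);
}

int main() {
    // d/dx log(x) at x = 2 should be ~0.5.
    std::printf("%f\n", NumericalGradient([](double x) { return std::log(x); }, 2.0));
}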
#define PRINT_GRAD (0)
#define ACTUAL_GRAD (1)
template<class ElemType>
bool CtcTrainingIO<ElemType>::ComputeDerivative(const wstring& uttID,
const Matrix<ElemType>& logLikelihoodIn,
Matrix<ElemType>* derivative,
ElemType* objective)
{
#if(ACTUAL_GRAD)
bool ret = ComputeDerivativeActual(uttID,
logLikelihoodIn,
derivative,
objective);
#else
bool ret = ComputeDerivativeNumerical(uttID,
logLikelihoodIn,
derivative,
objective);
#endif
#if(PRINT_GRAD)
/* BEGIN: print gradients.
*
*/
printf("\n\n=====================================================");
printf("\nPrint (likelihood, gradients).\n");
printf("\nObjective = %f \n", *objective);
printf("=====================================================\n");
Matrix<ElemType> log_nnet_dup(logLikelihoodIn.Transpose());
if (log_nnet_dup.GetDeviceId() >= 0)
log_nnet_dup.TransferFromDeviceToDevice(log_nnet_dup.GetDeviceId(), CPUDEVICE, true, false, false);
for(int i =0;i<derivative->GetNumRows();i++) {
for(int j=0; j<derivative->GetNumCols();j++) {
printf("(%f, %f)", logLikelihoodIn(i,j), (*derivative)(i,j));
}
printf("\n");
}
printf("=====================================================\n\n");
/*
END: print gradients.
*/
#endif
return ret;
}
template<class ElemType>
bool CtcTrainingIO<ElemType>::HasResourceForDerivative(
const wstring& uttID) const

View file

@@ -7,8 +7,6 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// This class deals with the CTC training in CNTK.
//TODO: this code needs to inherit from a parallel fwd/bwd propagation class:
// -> but note that fwd/bwd on lattice would still be single threaded.
template<class ElemType>
class CtcTrainingIO :
public UtteranceDerivativeComputationInterface<ElemType>
@@ -51,7 +49,36 @@ protected:
Matrix<ElemType> &beta,
Matrix<ElemType> &log_nnet_out,
std::vector<size_t> &labels,
float pzx);
ElemType pzx);
virtual bool ComputeDerivativeActual(const wstring& uttID,
const Matrix<ElemType>& logLikelihoodIn,
Matrix<ElemType>* derivative,
ElemType* objective);
virtual bool ComputeDerivativeNumerical(const wstring& uttID,
const Matrix<ElemType>& logLikelihoodIn,
Matrix<ElemType>* derivative,
ElemType* objective);
/*
static const ElemType log_zero_ = -1e100;
static const ElemType exp_limit_ = 709.78271289338397;
static const ElemType log_inf_ = 1e100;
static const ElemType max_ = 1.7976931348623157e+308;
*/
static const ElemType log_zero_ = -1e30f;
static const ElemType exp_limit_ = 88.722839f;
static const ElemType log_inf_ = 1e30f;
static const ElemType max_ = 3.4028235e+038f;
ElemType AddAB(ElemType a, ElemType b);
ElemType SubAB(ElemType a, ElemType b);
ElemType ExpA(ElemType a);
ElemType LogAPlusB(ElemType a, ElemType b);
};
}}}
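The hard-coded constants above are the single-precision limits (88.722839 is approximately log(FLT_MAX), 3.4028235e+38 is FLT_MAX), with the double-precision values left in the comment; note also that in-class initializers for non-integral static const members are an MSVC extension rather than standard C++. A sketch (not part of the commit) of deriving the limits per ElemType instead, in the spirit of the templatization:

#include <cmath>
#include <limits>

// Hypothetical alternative: derive the log-domain limits from the element
// type so float and double each get the right values automatically.
template<class ElemType>
struct LogLimits {
    static ElemType LogZero()  { return static_cast<ElemType>(-1e30); } // "log of 0" sentinel
    static ElemType ExpLimit() { return std::log(std::numeric_limits<ElemType>::max()); }
    static ElemType LogInf()   { return static_cast<ElemType>(1e30); }
    static ElemType Max()      { return std::numeric_limits<ElemType>::max(); }
};
// LogLimits<float>::ExpLimit() ~ 88.72; LogLimits<double>::ExpLimit() ~ 709.78.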