Adding the speech demo example

This commit is contained in:
Malcolm Slaney 2014-11-22 15:37:09 -08:00
Родитель 8b9f46d74e
Коммит efdf17b573
15 изменённых файлов: 451607 добавлений и 0 удалений

3696
Demos/Speech/CntkTimit.scp Executable file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

3696
Demos/Speech/CntkTimitOutput.scp Executable file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

184
Demos/Speech/ComputeConfusion.m Executable file
Просмотреть файл

@ -0,0 +1,184 @@
function confusionData = ComputeConfusion(mlfFile)
% function confusionData = ComputeConfusion(mlfFile)
% Compute all the confusions for one experiment. Read in the TIMIT MLF file
% so we know which utterances we have. For each utterance, read in the
% CNTK output and compute the confusion matrix. Sum them all together.
%
% Input:
%   mlfFile - name of the master label file (default 'TimitLabels.mlf').
% Output:
%   confusionData - sum of the per-utterance matrices returned by
%       ComputeConfusionOnce. Remains the scalar 0 if no utterance could be
%       processed.
%
% Expected MLF layout (as parsed below):
%   #!MLF!#                 header, ignored
%   "name"                  quoted utterance (feature file) name
%   <begin> <end> <label>   one line per segment, integer times
%   .                       end of the utterance
if nargin < 1
    mlfFile = 'TimitLabels.mlf';
end

fp = fopen(mlfFile, 'r');
if fp < 0
    error('ComputeConfusion:cannotOpen', 'Unable to open MLF file %s', mlfFile);
end
closeFile = onCleanup(@() fclose(fp));  % close the file even if we error out

% Multiplier that turns the integer MLF times into seconds.
% NOTE(review): assumes the MLF uses HTK's 100 ns time units -- confirm.
scale = 1e-7;

emptySegments = struct('begin', {}, 'end', {}, 'label', {});
segmentLabels = emptySegments;
fileName = '';
confusionData = 0;

while true
    theLine = fgets(fp);
    if isempty(theLine) || ~ischar(theLine)  % fgets returns -1 at end of file
        break;
    end
    if strncmp(theLine, '#!MLF!#', 7)
        continue;                            % Ignore the header
    end
    if theLine(1) == '"'                     % Look for file name indication
        fileName = strtok(theLine);
        fileName = fileName(2:end-1);        % strip the surrounding quotes
        segmentLabels = emptySegments;       % start a fresh utterance
    end
    if theLine(1) >= '0' && theLine(1) <= '9'
        % Got a speech segment with times and phoneme label. Parse it.
        c = textscan(theLine, '%d %d %s ');
        if isempty(c{1}) || isempty(c{2}) || isempty(c{3})
            continue;                        % malformed segment line, skip it
        end
        % Add a new entry (times in seconds) to the list of segments. The
        % list is kept as a row because ComputeConfusionOnce walks it along
        % dimension 2.
        segmentLabels(end+1) = struct( ...
            'begin', double(c{1}(1))*scale, ...
            'end',   double(c{2}(1))*scale, ...
            'label', c{3}{1}); %#ok<AGROW>
    end
    if theLine(1) == '.'
        % Found the end of the speech transcription. Process the new data,
        % unless the utterance had no name or no segments.
        if ~isempty(fileName) && ~isempty(segmentLabels)
            confusionData = confusionData + ...
                ComputeConfusionOnce(fileName, segmentLabels);
        end
        segmentLabels = emptySegments;
    end
end
function Confusions = ComputeConfusionOnce(utteranceName, segmentLabels)
% function Confusions = ComputeConfusionOnce(utteranceName, segmentLabels)
% Compute the confusion matrix for one TIMIT utterance. This routine takes
% the segment data (from the TIMIT label file) and a feature-file name. It
% transforms the feature-file name into the name of the CNTK output file,
% reads in that file, and tabulates a confusion matrix. We do this one
% segment at a time, since TIMIT segments are variable length, and the CNTK
% output is sampled at regular intervals (10ms).
%
% Inputs:
%   utteranceName - feature file name; 'Features/' is replaced by 'Output/'
%       and 'fbank_zda' by 'log' to find the CNTK output.
%   segmentLabels - struct array with fields begin, end (seconds) and label.
% Output:
%   Confusions - nStates x nStates matrix of frame counts, indexed as
%       (true label, CNTK winner). Scalar 0 if the output file can't be read.
likelihoodName = strrep(strrep(utteranceName, 'Features/', 'Output/'), ...
    'fbank_zda', 'log');
try
    [likelihood,~] = htkread(likelihoodName);
catch
    % Best effort: a missing or unreadable output file contributes nothing.
    fprintf('Can''t read %s using htkread. Ignoring.\n', likelihoodName);
    Confusions = 0;
    return
end

nStates = 183;          % Preordained.
frameRate = 100;        % Preordained (frames per second)
Confusions = zeros(nStates, nStates);
nFrames = size(likelihood, 1);

for i = 1:numel(segmentLabels)
    % Go through each entry in the MLF file for one utterance. Each entry
    % lists the beginning and end of each speech state.
    % Compare the true label with the winner of the maximum likelihood from
    % CNTK.
    correctLabel = FindLabelNumber(segmentLabels(i).label);
    if isempty(correctLabel)
        continue;       % label is not in the state list, nothing to count
    end
    % A segment covering [begin, end) seconds owns the 1-based frames
    % begin*frameRate+1 .. end*frameRate. Without the +1 the frame on each
    % boundary would be counted for both neighbouring segments.
    beginIndex = max(1, round(segmentLabels(i).begin*frameRate) + 1);
    endIndex = min(nFrames, round(segmentLabels(i).end*frameRate));
    if beginIndex > endIndex
        continue;       % segment falls outside the available frames
    end
    [~,winners] = max(likelihood(beginIndex:endIndex,:), [], 2);
    for w = winners(:)' % increment one at a time (winners may repeat)
        Confusions(correctLabel, w) = Confusions(correctLabel, w) + 1;
    end
end
function labelNumber = FindLabelNumber(labelName)
% Map a label name onto its position in the state list. The list is read
% from TimitStateList.txt on the first call and cached for later calls.
% Returns [] when the name does not appear in the list.
persistent stateList
if isempty(stateList)
    stateList = ReadStateList('TimitStateList.txt');
end

% Compare against every entry at once and keep the first match.
firstHit = find(strcmp(labelName, stateList), 1, 'first');
if isempty(firstHit)
    labelNumber = [];
else
    labelNumber = firstHit;
end
function stateList = ReadStateList(stateListFile)
% Read in the state list file. This file contains an ordered list of
% states, each corresponding to one label (and one output in the CNTK
% network.)
%
% Input:
%   stateListFile - name of a text file with one state name per line.
% Output:
%   stateList - column cell array (preallocated to 183 entries) holding the
%       state names in file order. Entries past the last line read stay
%       empty.
fp = fopen(stateListFile);
if fp < 0
    error('ReadStateList:cannotOpen', ...
        'Unable to open state list file %s', stateListFile);
end
nStates = 183; % Preordained
stateList = cell(nStates, 1);
stateIndex = 1;
while true
    theLine = fgets(fp);
    if isempty(theLine) || ~ischar(theLine)  % fgets returns -1 at end of file
        break;
    end
    % Strip the line terminator whatever it is (LF, CRLF, or none on the
    % last line) instead of blindly dropping the final character.
    stateList{stateIndex} = strtrim(theLine);
    stateIndex = stateIndex + 1;
end
fclose(fp);
function [ DATA, HTKCode ] = htkread( Filename )
% [ DATA, HTKCode ] = htkread( Filename )
%
% Load the contents of an HTK-format parameter file, which may be stored
% in compressed form.
%
% Filename (string)      - Path of the file to load
% DATA (nSamp x NUMCOFS) - One row per frame, one column per coefficient
% HTKCode                - Parameter-kind code taken from the file header
%
% Compressed files are expanded as described in section 5.10 of the
% HTKBook. The CRC is not checked.
%
% Mark Hasegawa-Johnson
% July 3, 2002
% Based on function mfcc_read written by Alexis Bernard
% Found at: https://raw.githubusercontent.com/ronw/matlab_htk/master/htkread.m
%
fid = fopen(Filename, 'r', 'b');   % 'b': the file is read big-endian
if fid < 0
    error(sprintf('Unable to read from file %s',Filename));
end

% Header, in file order: frame count, sample period, bytes per frame and
% the parameter-kind code.
nSamp = fread(fid, 1, 'int32');
sampPeriod = fread(fid, 1, 'int32');   % not used, but must be consumed
sampSize = fread(fid, 1, 'int16');
HTKCode = fread(fid, 1, 'int16');

% Bit 11 of the code flags a compressed file.
isCompressed = bitget(HTKCode, 11);
if isCompressed
    % Two bytes per stored value; the frame count is reduced by 4 before
    % the sample matrix is read.
    nCoef = sampSize / 2;
    nSamp = nSamp - 4;
    % Per-coefficient decompression parameters come first.
    scaleA = fread(fid, [1 nCoef], 'float');
    offsetB = fread(fid, [1 nCoef], 'float');
    % Then the 16-bit samples, expanded as (value + B) / A.
    packed = fread(fid, [nCoef nSamp], 'int16')';
    shifted = repmat(offsetB, [nSamp 1]) + packed;
    DATA = shifted ./ repmat(scaleA, [nSamp 1]);
else
    % Uncompressed: four bytes (one float) per stored value.
    nCoef = sampSize / 4;
    DATA = fread(fid, [nCoef nSamp], 'float')';
end
fclose(fid);

17
Demos/Speech/HCopyTimit.config Executable file
Просмотреть файл

@ -0,0 +1,17 @@
# HCopy configuration for the TIMIT speech demo: read NIST-format waveforms
# and write FBANK_Z_D_A feature files.
# NOTE(review): the time-valued settings below are presumably in HTK's
# 100 ns units -- confirm against the HTK Book. Under that assumption
# TARGETRATE is a 10 ms frame step, SOURCERATE a 16 kHz sample rate and
# WINDOWSIZE a 25 ms analysis window.
# Input: raw waveforms in NIST format.
SOURCEKIND = WAVEFORM
SOURCEFORMAT = NIST
# Output: uncompressed files without a CRC.
SAVECOMPRESSED = FALSE
SAVEWITHCRC = FALSE
# Feature kind written to the output files.
TARGETKIND = FBANK_Z_D_A
TARGETRATE = 100000
SOURCERATE = 625
WINDOWSIZE = 250000.0
# Signal conditioning applied before the filterbank analysis.
PREEMCOEF = 0.97
ZMEANSOURCE = TRUE
USEHAMMING = TRUE
# Number of filterbank channels and the frequency range they span.
# NOTE(review): LOFREQ/HIFREQ are presumably in Hz -- confirm.
NUMCHANS = 24
ENORMALISE = TRUE
ESCALE = 1.0
LOFREQ = 64
HIFREQ = 8000

3696
Demos/Speech/HCopyTimit.scp Executable file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

77
Demos/Speech/ShowConfusions.m Executable file
Просмотреть файл

@ -0,0 +1,77 @@
function ShowConfusions(confusionData, squeeze)
% function ShowConfusions(confusionData, squeeze)
% Average the three-state confusion data into monophone confusions. Then
% display the data. A graphical interface lets you interrogate the data,
% by moving the mouse, and clicking at various points. The phonetic labels
% are shown on the graph.
%
% Inputs:
%   confusionData - state-level confusion matrix, rows indexed by true
%       label and columns by CNTK prediction, three consecutive states per
%       phone.
%   squeeze - exponent applied element-wise before display (default 1).
%       Note that this argument name hides MATLAB's squeeze function inside
%       this routine.
if nargin < 2
    squeeze = 1;
end

% Collapse each 3x3 block of state confusions into one phone-level cell by
% averaging its nine entries.
confusionSmall = 0;
for colState = 1:3
    for rowState = 1:3
        confusionSmall = confusionSmall + ...
            confusionData(rowState:3:end, colState:3:end);
    end
end
confusionSmall = confusionSmall / 9;

imagesc(confusionSmall .^ squeeze)
axis ij
axis square
ylabel('True Label');
xlabel('CNTK Prediction');
%%
stateList = ReadStateList();
h = [];
fprintf('Select a point with the mouse, type return to end...\n');
while true
    [x,y] = ginput(1);
    if isempty(x) || isempty(y)
        break;              % return was pressed
    end
    if ~isempty(h)
        delete(h);          % remove the previous annotation
        h = [];
    end
    % The image rows (y) are the true labels and the columns (x) are the
    % CNTK predictions, matching the axis labels above. Each phone owns
    % three consecutive entries of the state list, so phone k starts at
    % entry (k-1)*3+1. Clicks outside the image index out of range and are
    % reported as 'Unknown'.
    try
        trueLabel = stateList{(round(y)-1)*3+1};
    catch
        trueLabel = 'Unknown';
    end
    try
        likelihoodLabel = stateList{(round(x)-1)*3+1};
    catch
        likelihoodLabel = 'Unknown';
    end
    h = text(40, -2, sprintf('%s -> %s', trueLabel, likelihoodLabel));
    % h = text(40, -2, sprintf('%g -> %g', x, y));
end
function stateList = ReadStateList(stateListFile)
% Read in the state list file. This file contains an ordered list of
% states, each corresponding to one label (and one output in the CNTK
% network.) Only the part of each name before the first underscore is
% kept, so 'aa_s2' is returned as 'aa'.
%
% Input:
%   stateListFile - name of a text file with one state name per line
%       (default 'TimitStateList.txt').
% Output:
%   stateList - column cell array (preallocated to 183 entries) with one
%       name per line of the file, in file order.
if nargin < 1
    stateListFile = 'TimitStateList.txt';
end
fp = fopen(stateListFile);
if fp < 0
    error('ReadStateList:cannotOpen', ...
        'Unable to open state list file %s', stateListFile);
end
nStates = 183; % Preordained
stateList = cell(nStates, 1);
stateIndex = 1;
while true
    theLine = fgets(fp);
    if isempty(theLine) || ~ischar(theLine)  % fgets returns -1 at end of file
        break;
    end
    % Strip the line terminator whatever it is (LF, CRLF, or none on the
    % last line) instead of blindly dropping the final character.
    label = strtrim(theLine);
    f = find(label == '_', 1);
    if ~isempty(f)
        label = label(1:f-1);   % drop the state suffix
    end
    stateList{stateIndex} = label;
    stateIndex = stateIndex + 1;
end
fclose(fp);

66
Demos/Speech/TimitGetFiles.py Executable file
Просмотреть файл

@ -0,0 +1,66 @@
#!python
#
# Syntax: python TimitGetFiles.py TIMIT_base_directory_path
#
# Simple script to take the list of valid TIMIT utterances, and turn them into
# the script files needed for HCopy and CNTK.
#
# This program reads the TimitSubjectList.txt file, which is included in the
# demo distribution. It then creates three files:
#   HCopyTimit.scp - script that converts audio files to features
#   CntkTimit.scp - list of files that are read by CNTK for training
#   CntkTimitOutput.scp - list of output files for CNTK (likelihood scores)
# All these files can be edited by hand if you don't want to run this python
# script.
import os, sys

# Names of the input list and of the three script files that get written.
fileList = 'TimitSubjectList.txt'
hcopyScript = 'HCopyTimit.scp'
cnScript = 'CntkTimit.scp'
cnOutputScript = 'CntkTimitOutput.scp'

# Debugging aid: stop once more than this many utterances have been written.
DEBUG_MAX_FILES = 100000000


def makeScriptEntries(origFile, baseDir):
    """Build the three script lines for one utterance.

    origFile is one (already stripped) entry of the subject list and baseDir
    is the TIMIT base directory, ending in a path separator.

    Returns a tuple (hcopyLine, cnLine, cnOutputLine) of newline-terminated
    strings destined for HCopyTimit.scp, CntkTimit.scp and
    CntkTimitOutput.scp respectively.
    """
    fullFile = baseDir + origFile
    # Flatten the output structure.  Replace / with -
    outFile = origFile.replace('/', '-').replace('\\', '-')
    featFile = 'Features/train-' + outFile
    return (fullFile + '.nst ' + featFile + '.fbank_zda\n',
            featFile + '.fbank_zda\n',
            'Output/train-' + outFile + '.log\n')


def main(argv):
    """Write the three script files; return the process exit status."""
    # Set up the base directory name.  This will be prepended to each file
    # name so hcopy knows where to find the wave files.
    if len(argv) > 1:
        baseDir = argv[1]
        if not os.path.isdir(baseDir):
            print("Can't find TIMIT base directory: " + baseDir)
            return 1
    else:
        print("Syntax: " + argv[0] + " TIMIT_base_directory_path")
        return 1
    if not baseDir.endswith('\\') and not baseDir.endswith('/'):
        baseDir += '/'

    try:
        # The input list is opened first so that a missing list does not
        # leave behind freshly truncated script files.
        with open(fileList) as fileListFp, \
                open(hcopyScript, 'w') as hcopyScriptFp, \
                open(cnScript, 'w') as cnScriptFp, \
                open(cnOutputScript, 'w') as cnOutputScriptFp:
            fileCount = 0
            for origFile in fileListFp:
                origFile = origFile.strip()
                if not origFile:
                    continue  # ignore blank lines in the subject list
                hcopyLine, cnLine, cnOutputLine = makeScriptEntries(
                    origFile, baseDir)
                hcopyScriptFp.write(hcopyLine)
                cnScriptFp.write(cnLine)
                cnOutputScriptFp.write(cnOutputLine)
                fileCount += 1
                if fileCount > DEBUG_MAX_FILES:		# Debugging
                    break
    except IOError as err:
        # open() raises on failure rather than returning a false value.
        print("Can't open the necessary files: " + str(err))
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

436123
Demos/Speech/TimitLabels.mlf Executable file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

183
Demos/Speech/TimitStateList.txt Executable file
Просмотреть файл

@ -0,0 +1,183 @@
aa_s2
aa_s3
aa_s4
ae_s2
ae_s3
ae_s4
ah_s2
ah_s3
ah_s4
ao_s2
ao_s3
ao_s4
aw_s2
aw_s3
aw_s4
ax_s2
ax_s3
ax_s4
axh_s2
axh_s3
axh_s4
axr_s2
axr_s3
axr_s4
ay_s2
ay_s3
ay_s4
b_s2
b_s3
b_s4
bcl_s2
bcl_s3
bcl_s4
ch_s2
ch_s3
ch_s4
d_s2
d_s3
d_s4
dcl_s2
dcl_s3
dcl_s4
dh_s2
dh_s3
dh_s4
dx_s2
dx_s3
dx_s4
eh_s2
eh_s3
eh_s4
el_s2
el_s3
el_s4
em_s2
em_s3
em_s4
en_s2
en_s3
en_s4
eng_s2
eng_s3
eng_s4
epi_s2
epi_s3
epi_s4
er_s2
er_s3
er_s4
ey_s2
ey_s3
ey_s4
f_s2
f_s3
f_s4
g_s2
g_s3
g_s4
gcl_s2
gcl_s3
gcl_s4
h#_s2
h#_s3
h#_s4
hh_s2
hh_s3
hh_s4
hv_s2
hv_s3
hv_s4
ih_s2
ih_s3
ih_s4
ix_s2
ix_s3
ix_s4
iy_s2
iy_s3
iy_s4
jh_s2
jh_s3
jh_s4
k_s2
k_s3
k_s4
kcl_s2
kcl_s3
kcl_s4
l_s2
l_s3
l_s4
m_s2
m_s3
m_s4
n_s2
n_s3
n_s4
ng_s2
ng_s3
ng_s4
nx_s2
nx_s3
nx_s4
ow_s2
ow_s3
ow_s4
oy_s2
oy_s3
oy_s4
p_s2
p_s3
p_s4
pau_s2
pau_s3
pau_s4
pcl_s2
pcl_s3
pcl_s4
q_s2
q_s3
q_s4
r_s2
r_s3
r_s4
s_s2
s_s3
s_s4
sh_s2
sh_s3
sh_s4
t_s2
t_s3
t_s4
tcl_s2
tcl_s3
tcl_s4
th_s2
th_s3
th_s4
uh_s2
uh_s3
uh_s4
uw_s2
uw_s3
uw_s4
ux_s2
ux_s3
ux_s4
v_s2
v_s3
v_s4
w_s2
w_s3
w_s4
y_s2
y_s3
y_s4
z_s2
z_s3
z_s4
zh_s2
zh_s3
zh_s4

3696
Demos/Speech/TimitSubjectList.txt Executable file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,156 @@
command=TimitTrainSimple:TimitTestSimple:TimitWriteSimple
# command=TimitWriteSimple
# deviceId=-1 for CPU, >=0 for GPU devices
DeviceNumber=0
stderr=Demo
precision=float
#######################################
# TRAINING CONFIG (Simple, Fixed LR) #
#######################################
TimitTrainSimple=[
action=train
modelPath=Models/TrainSimple.dnn
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
# Notation xxx:yyy*n:zzz is equivalent to xxx, then yyy repeated n times,
# then zzz
# example: 10:20*3:5 is equivalent to 10:20:20:20:5
SimpleNetworkBuilder=[
layerSizes=792:512*3:183
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
# epochSize=0 means epochSize is the size of the training set
epochSize=0
minibatchSize=256:1024
learningRatesPerMB=0.8:3.2*14:0.08
momentumPerMB=0.9
dropoutRate=0.0
# maxEpochs=25 # Full experiment
maxEpochs=1 # Small experiment for debugging
]
# Parameter values for the reader
reader=[
# reader to use
readerType=HTKMLFReader
# readMethod=blockRandomize
readMethod=rollingWindow
miniBatchMode=Partial
randomize=Auto
verbosity=1
features=[
dim=792
scpFile=CntkTimit.scp
]
labels=[
mlfFile=TimitLabels.mlf
labelDim=183
labelMappingFile=TimitStateList.txt
]
]
]
#######################################
# TESTING CONFIG (Simple) #
#######################################
TimitTestSimple=[
action=test
modelPath=Models/TrainSimple.dnn
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
# epochSize=0 means epochSize is the size of the training set
epochSize=0
minibatchSize=256:1024
# Parameter values for the reader
reader=[
# reader to use
readerType=HTKMLFReader
readMethod=rollingWindow
miniBatchMode=Partial
randomize=Auto
verbosity=1
# numMBsToShowResult=10
# evalNodeNames
features=[
dim=792
scpFile=CntkTimit.scp
]
labels=[
mlfFile=TimitLabels.mlf
labelDim=183
labelMappingFile=TimitStateList.txt
]
]
]
#######################################
# OUTPUT RESULTS (Simple) #
#######################################
TimitWriteSimple=[
action=write
modelPath=Models/TrainSimple.dnn
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
outputNodeNames=ScaledLogLikelihood
traceLevel=1
# Parameter values for the reader
reader=[
# reader to use
readerType=HTKMLFReader
features=[
dim=792
scpFile=CntkTimit.scp
]
]
writer=[
wrecords=50000 #Number of records to allocate space for in file
writerType=HTKMLFWriter
ScaledLogLikelihood = [
dim = 183
scpFile=CntkTimitOutput.scp
type=Real # Must be uppercase Real?
]
]
]

Просмотреть файл

@ -0,0 +1,17 @@
#!/bin/bash
# Run the TIMIT speech demo end to end:
#   1. generate the HCopy/CNTK script files from the TIMIT subject list,
#   2. extract features from the TIMIT audio with HTK's HCopy,
#   3. run CNTK with the TrainSimpleTimit.config configuration.
#
# HTK, TIMIT and CNdir can be preset in the environment to override the
# default locations below.

# Stop at the first failing step instead of running later steps on stale
# or missing inputs.
set -e

HTK=${HTK:-/cygdrive/c/Users/mslaney/Projects/HTK/bin.win32}
TIMIT=${TIMIT:-c:/Users/mslaney/Projects/TIMIT/timit/train}
CNdir=${CNdir:-../../..}

HCopyConfig=HCopyTimit.config
HCopyScript=HCopyTimit.scp

# Write HCopyTimit.scp, CntkTimit.scp and CntkTimitOutput.scp.
python TimitGetFiles.py "$TIMIT"

# The generated scripts and the CNTK config refer to these directories;
# make sure they exist before anything tries to write into them.
mkdir -p Features Output Models

"$HTK/hcopy.exe" -C "$HCopyConfig" -S "$HCopyScript"

# Remove any previous model files before running CNTK.
rm -f Models/TrainSimple.dnn*
time "$CNdir/cn.exe" configFile=TrainSimpleTimit.config

Двоичные данные
papers/CNTK-TechReport/figures/ConfusionData1.png Executable file

Двоичный файл не отображается.

После

Ширина:  |  Высота:  |  Размер: 16 KiB

Двоичные данные
papers/CNTK-TechReport/figures/ConfusionData100.png Executable file

Двоичный файл не отображается.

После

Ширина:  |  Высота:  |  Размер: 14 KiB

Двоичные данные
papers/CNTK-TechReport/figures/SpeechErrorRate.png Executable file

Двоичный файл не отображается.

После

Ширина:  |  Высота:  |  Размер: 11 KiB