Adding the speech demo example

2014-11-22 15:37:09 -08:00 · 2014-11-22 15:37:09 -08:00 · efdf17b573
--- a/Demos/Speech/CntkTimit.scp
+++ b/Demos/Speech/CntkTimit.scp
--- a/Demos/Speech/CntkTimitOutput.scp
+++ b/Demos/Speech/CntkTimitOutput.scp
--- a/Demos/Speech/ComputeConfusion.m
+++ b/Demos/Speech/ComputeConfusion.m
@ -0,0 +1,184 @@
+function confusionData = ComputeConfusion(mlfFile)
+% function confusionData = ComputeConfusion(mlfFile)
+% Compute all the confusions for one experiment. Read in the TIMIT MLF file
+% so we know which utterances we have.  For each utterance, read in the
+% CNTK output and compute the confusion matrix.  Sum them all together.
+%
+% mlfFile       - Name of the MLF label file (default 'TimitLabels.mlf').
+% confusionData - Sum of the matrices returned by ComputeConfusionOnce for
+%                 every utterance in the MLF file.  Stays the scalar 0 when
+%                 the file contains no complete utterance.
+%
+% Raises an error if the MLF file cannot be opened.
+if nargin < 1
+    mlfFile = 'TimitLabels.mlf';
+end
+
+%%
+% Parse the Timit MLF file because it tells us the true phonetic labels
+% for each segment of each utterance.
+fp = fopen(mlfFile,'r');
+if fp < 0
+    error('ComputeConfusion:cannotOpen', 'Can''t open MLF file %s.', mlfFile);
+end
+closeFile = onCleanup(@() fclose(fp));  % Close the file on any exit path.
+
+scale = 1e-7;               % Multiplier applied to the MLF begin/end times
+                            % (presumably HTK 100 ns units -> seconds).
+emptySegment = struct('begin', 0, 'end', 0, 'label', '');
+fileName = '';
+segmentLabels = emptySegment([]);   % Segments of the current utterance.
+segmentCount = 0;
+confusionData = 0;
+while 1
+    theLine = fgets(fp);
+    if isempty(theLine) || theLine(1) == -1
+        break;                  % fgets returns -1 at end of file
+    end
+    if strncmp(theLine, '#!MLF!#', 7)
+        continue;               % Ignore the header
+    end
+    if theLine(1) == '"'        % Look for file name indication
+        fileName = strtok(theLine);
+        fileName = fileName(2:end-1);       % Strip the surrounding quotes
+        % Start a fresh segment list so nothing leaks in from the
+        % previous utterance.
+        segmentLabels = emptySegment([]);
+        segmentCount = 0;
+    end
+    if theLine(1) >= '0' && theLine(1) <= '9'
+        % Got a speech segment with times and phoneme label. Parse it.
+        fields = textscan(theLine, '%d %d %s ');
+        b = double(fields{1}(1));
+        e = double(fields{2}(1));
+        l = fields{3}{1};
+        % Add a new entry in the list of segments.
+        segmentCount = segmentCount + 1;
+        segmentLabels(segmentCount) = ...
+            struct('begin', b*scale, 'end', e*scale, 'label', l);
+    end
+    if theLine(1) == '.'
+        % Found the end of the speech transcription.  Process the new data.
+        oneConfusion = ComputeConfusionOnce(fileName, ...
+            segmentLabels(1:segmentCount));
+        confusionData = confusionData + oneConfusion;
+        segmentLabels = emptySegment([]);
+        segmentCount = 0;
+    end
+end
+
+function Confusions = ComputeConfusionOnce(utteranceName, segmentLabels)
+% function Confusions = ComputeConfusionOnce(utteranceName, segmentLabels)
+% Compute the confusion matrix for one TIMIT utterance.  This routine takes
+% the segment data (from the TIMIT label file) and a feature-file name.  It
+% transforms the feature-file name into the name of the CNTK output file,
+% reads in that output file, and tabulates a confusion matrix.  We do this
+% one segment at a time, since TIMIT segments are variable length, and the
+% CNTK output is sampled at regular intervals (10ms).
+%
+% utteranceName - Feature file name as listed in the MLF file.
+% segmentLabels - 1xN struct array with fields begin, end (both multiplied
+%                 by frameRate below to get frame numbers) and label.
+% Confusions    - nStates x nStates count matrix; rows are the true label,
+%                 columns are the CNTK winner.  Returns the scalar 0 if the
+%                 CNTK output file can't be read.
+likelihoodName = strrep(strrep(utteranceName, 'Features/', 'Output/'), ...
+    'fbank_zda', 'log');
+try
+    [likelihood,~] = htkread(likelihoodName);
+catch
+    fprintf('Can''t read %s using htkread.  Ignoring.\n', likelihoodName);
+    Confusions = 0;
+    return
+end
+
+nStates = 183;              % Preordained.
+frameRate = 100;            % Preordained
+nFrames = size(likelihood,1);
+Confusions = zeros(nStates, nStates);
+for i=1:size(segmentLabels, 2)
+    % Go through each entry in the MLF file for one utterance.  Each entry
+    % lists the beginning and end of each speech state.
+    % Compare the true label with the winner of the maximum likelihood from
+    % CNTK.
+    correctLabel = FindLabelNumber(segmentLabels(i).label);
+    if isempty(correctLabel)
+        continue;           % Label is not in the state list; nothing to count.
+    end
+    % A segment covers the 0-based frames begin*frameRate up to (but not
+    % including) end*frameRate.  Converting to 1-based rows adds one to the
+    % first frame, so the frame on the boundary between two adjacent
+    % segments is counted once, not under both labels.
+    beginIndex = max(1, round(segmentLabels(i).begin*frameRate) + 1);
+    endIndex = min(nFrames, round(segmentLabels(i).end*frameRate));
+    [~,winners] = max(likelihood(beginIndex:endIndex,:),[], 2);
+    for w=winners(:)'                 % increment one at a time
+        Confusions(correctLabel, w) = Confusions(correctLabel, w) + 1;
+    end
+end
+
+function labelNumber = FindLabelNumber(labelName)
+% Translate a label name into its index in the state list.  The list is
+% loaded once from the TimitStateList file and cached between calls.
+% Returns [] when the name does not appear in the list.
+persistent stateList
+if isempty(stateList)
+    stateList = ReadStateList('TimitStateList.txt');
+end
+% strcmp compares the name against every cell at once; keep the first hit.
+labelNumber = find(strcmp(labelName, stateList), 1, 'first');
+if isempty(labelNumber)
+    labelNumber = [];
+end
+
+function stateList = ReadStateList(stateListFile)
+% Read in the state list file. This file contains an ordered list of
+% states, each corresponding to one label (and one output in the CNTK
+% network.)
+%
+% stateListFile - Name of the text file, one state name per line.
+% stateList     - Cell column vector of state names in file order
+%                 (preallocated to 183 entries).
+%
+% Raises an error if the file cannot be opened.
+fp = fopen(stateListFile);
+if fp < 0
+    error('ReadStateList:cannotOpen', ...
+        'Can''t open state list file %s.', stateListFile);
+end
+nStates = 183;              % Preordained
+stateList = cell(nStates, 1);
+stateIndex = 1;
+while true
+    theLine = fgets(fp);
+    if isempty(theLine) || theLine(1) == -1
+        break;              % fgets returns -1 at end of file
+    end
+    % Strip only the line terminator.  Blindly dropping the last character
+    % would truncate a final line that has no newline, and would leave the
+    % '\r' of a CRLF file glued to the name.
+    stateList{stateIndex} = regexprep(theLine, '[\r\n]+$', '');
+    stateIndex = stateIndex + 1;
+end
+fclose(fp);
+        
+
+function [ DATA, HTKCode ] = htkread( Filename )
+% [ DATA, HTKCode ] = htkread( Filename )
+%
+% Read DATA from possibly compressed HTK format file.
+%
+% Filename (string) - Name of the file to read from
+% DATA (nSamp x NUMCOFS) - Output data array
+% HTKCode - HTKCode describing file contents
+%
+% Raises an error if the file cannot be opened.
+%
+% Compression is handled using the algorithm in 5.10 of the HTKBook.
+% CRC is not implemented.
+%
+% Mark Hasegawa-Johnson
+% July 3, 2002
+% Based on function mfcc_read written by Alexis Bernard
+% Found at: https://raw.githubusercontent.com/ronw/matlab_htk/master/htkread.m
+%
+
+% Open for reading with big-endian byte ordering ('b').
+fid=fopen(Filename,'r','b');
+if fid<0,
+    error(sprintf('Unable to read from file %s',Filename));
+end
+
+% The header is four fields, read in this fixed order.
+% Read number of frames
+nSamp = fread(fid,1,'int32');
+
+% Read sampPeriod (read to advance past the field; not used below)
+sampPeriod = fread(fid,1,'int32');
+
+% Read sampSize (bytes per frame: divided by 2 or 4 below to get DIM)
+sampSize = fread(fid,1,'int16');
+
+% Read HTK Code
+HTKCode = fread(fid,1,'int16');
+
+%%%%%%%%%%%%%%%%%
+% Read the data
+% Bit 11 of the HTK code selects the compressed (int16) layout.
+if bitget(HTKCode, 11),
+    % Compressed values are 2-byte integers.
+    DIM=sampSize/2;
+    % The two float vectors A and B read below take the space of four
+    % int16 frames, so they are not counted as data frames.
+    nSamp = nSamp-4;
+    %disp(sprintf('htkread: Reading %d frames, dim %d, compressed, from %s',nSamp,DIM,Filename)); 
+
+    % Read the compression parameters
+    A = fread(fid,[1 DIM],'float');
+B = fread(fid,[1 DIM],'float');
+    
+    % Read and uncompress the data
+    % fread fills column-wise, so read DIM x nSamp and transpose to get
+    % one frame per row.
+    DATA = fread(fid, [DIM nSamp], 'int16')';
+    % Per-column decompression: value = (stored + B) / A.
+    DATA = (repmat(B, [nSamp 1]) + DATA) ./ repmat(A, [nSamp 1]);
+
+    
+else
+    % Uncompressed values are 4-byte floats.
+    DIM=sampSize/4;
+    %disp(sprintf('htkread: Reading %d frames, dim %d, uncompressed, from %s',nSamp,DIM,Filename)); 
+
+    % If not compressed: Read floating point data
+    DATA = fread(fid, [DIM nSamp], 'float')';
+end
+
+fclose(fid);
+
--- a/Demos/Speech/HCopyTimit.config
+++ b/Demos/Speech/HCopyTimit.config
@ -0,0 +1,17 @@
+SOURCEKIND     = WAVEFORM
+SOURCEFORMAT   = NIST
+SAVECOMPRESSED = FALSE
+SAVEWITHCRC    = FALSE
+TARGETKIND     = FBANK_Z_D_A
+TARGETRATE     = 100000
+SOURCERATE     = 625
+WINDOWSIZE     = 250000.0
+PREEMCOEF      = 0.97
+ZMEANSOURCE    = TRUE
+USEHAMMING     = TRUE
+NUMCHANS       = 24
+ENORMALISE     = TRUE
+ESCALE         = 1.0
+LOFREQ         = 64
+HIFREQ         = 8000
+
--- a/Demos/Speech/HCopyTimit.scp
+++ b/Demos/Speech/HCopyTimit.scp
--- a/Demos/Speech/ShowConfusions.m
+++ b/Demos/Speech/ShowConfusions.m
@ -0,0 +1,77 @@
+function ShowConfusions(confusionData, squeeze)
+% function ShowConfusions(confusionData, squeeze)
+% Average the three-state confusion data into monophone confusions.  Then
+% display the data.  A graphical interface lets you interrogate the data,
+% by moving the mouse, and clicking at various points.  The phonetic labels
+% are shown on the graph.
+%
+% confusionData - Square confusion matrix whose rows are true labels and
+%                 whose columns are CNTK predictions, three consecutive
+%                 states per phone.
+% squeeze       - Exponent applied element-wise to the averaged matrix
+%                 before display (default 1).
+confusionSmall = ( ...
+    confusionData(1:3:end,1:3:end) + confusionData(2:3:end, 1:3:end) + confusionData(3:3:end, 1:3:end) + ...
+    confusionData(1:3:end,2:3:end) + confusionData(2:3:end, 2:3:end) + confusionData(3:3:end, 2:3:end) + ...
+    confusionData(1:3:end,3:3:end) + confusionData(2:3:end, 3:3:end) + confusionData(3:3:end, 3:3:end))/9;
+
+if nargin < 2
+    squeeze = 1;
+end
+
+imagesc(confusionSmall .^ squeeze)
+axis ij
+axis square
+ylabel('True Label');
+xlabel('CNTK Prediction');
+
+%%
+stateList = ReadStateList();
+
+h = [];
+fprintf('Select a point with the mouse, type return to end...\n');
+while true
+    [x,y] = ginput(1);
+    if isempty(x) || isempty(y)
+        break;              % Return pressed: ginput gives back nothing
+    end
+
+    if ~isempty(h)
+        delete(h);          % Remove the annotation from the previous click
+        h = [];
+    end
+    % imagesc draws matrix rows along the y axis and columns along the x
+    % axis, so the true label comes from y and the CNTK prediction from x.
+    % Each phone owns three consecutive entries of the state list; use the
+    % first one.  Clicks outside the image index out of range and are
+    % reported as 'Unknown'.
+    try
+        trueLabel = stateList{(round(y)-1)*3+1};
+    catch
+        trueLabel = 'Unknown';
+    end
+    try
+        likelihoodLabel = stateList{(round(x)-1)*3+1};
+    catch
+        likelihoodLabel = 'Unknown';
+    end
+    h = text(40, -2, sprintf('%s -> %s', trueLabel, likelihoodLabel));
+    % h = text(40, -2, sprintf('%g -> %g', x, y));
+end
+
+function stateList = ReadStateList(stateListFile)
+% Read in the state list file. This file contains an ordered list of
+% states, each corresponding to one label (and one output in the CNTK
+% network.)  Only the part of each name before the first '_' is kept, so
+% all the states of one phone share the same label.
+%
+% stateListFile - Name of the text file (default 'TimitStateList.txt').
+% stateList     - Cell column vector of labels in file order
+%                 (preallocated to 183 entries).
+%
+% Raises an error if the file cannot be opened.
+if nargin < 1
+    stateListFile = 'TimitStateList.txt';
+end
+% Read in the state list file.
+fp = fopen(stateListFile);
+if fp < 0
+    error('ReadStateList:cannotOpen', ...
+        'Can''t open state list file %s.', stateListFile);
+end
+nStates = 183;              % Preordained
+stateList = cell(nStates, 1);
+stateIndex = 1;
+while true
+    theLine = fgets(fp);
+    if isempty(theLine) || theLine(1) == -1
+        break;              % fgets returns -1 at end of file
+    end
+    f = find(theLine == '_');
+    if ~isempty(f)
+        label = theLine(1:f(1)-1);
+    else
+        % No state suffix: keep the whole name, stripping only the line
+        % terminator.  Dropping the last character unconditionally would
+        % truncate a final line without a newline and keep the '\r' of a
+        % CRLF file.
+        label = regexprep(theLine, '[\r\n]+$', '');
+    end
+    stateList{stateIndex} = label;
+    stateIndex = stateIndex + 1;
+end
+fclose(fp);
--- a/Demos/Speech/TimitGetFiles.py
+++ b/Demos/Speech/TimitGetFiles.py
@ -0,0 +1,66 @@
+#!python
+# 
+# Syntax: python TimitGetFiles.py TIMIT_base_directory_path
+#
+# Simple script to take the list of valid TIMIT utterances, and turn them into
+# the script files needed for HCopy and CNTK.
+#
+# This program reads the TimitSubjectList.txt file, which is included in the
+# demo distribution.  It then creates three files:
+#	HCopyTimit.scp - script that tells HCopy which wave files to convert to which feature files
+#	CntkTimit.scp - list of files that are read by CNTK for training
+#	CntkTimitOutput.scp - list of output files for CNTK (likelihood scores)
+# All these files can be edited by hand if you don't want to run this python
+# script.
+
+import os, sys
+
+# Set up the base directory name.  This will be prepended to each file name
+# so hcopy knows where to find the wave files.
+if len(sys.argv) > 1:
+	baseDir = sys.argv[1]
+	if os.path.isdir(baseDir) == False:
+		print "Can't find TIMIT base directory: " + baseDir
+		sys.exit(1)
+else:
+	print "Syntax: " + sys.argv[0] + " TIMIT_base_directory_path"
+	sys.exit(1)
+
+if not baseDir.endswith('\\') and not baseDir.endswith('/'):
+	baseDir += '/'
+
+hcopyScript = 'HCopyTimit.scp'
+cnScript = 'CntkTimit.scp'
+cnOutputScript = 'CntkTimitOutput.scp'
+
+hcopyScriptFp = open(hcopyScript, 'w')
+cnScriptFp = open(cnScript, 'w')
+cnOutputScriptFp = open(cnOutputScript, 'w')
+fileCount = 0
+
+fileList = 'TimitSubjectList.txt'
+fileListFp = open(fileList)
+
+if !hcopyScript or !cnScriptFp or !cnOutputScriptFp or !fileListFp:
+	print "Can't open the necessary output files.
+	sys.exit(0)
+
+for origFile in fileListFp:
+	origFile = origFile.strip()
+	fullFile = baseDir + origFile
+	
+	# Flatten the output structure.  Replace / with -
+	outFile = origFile.replace('/', '-').replace('\\', '-')
+	featFile = 'Features/train-' + outFile
+	hcopyScriptFp.write(fullFile+'.nst ' + featFile+'.fbank_zda\n')
+	cnScriptFp.write(featFile+'.fbank_zda\n')
+	cnOutputScriptFp.write('Output/train-'+outFile+'.log\n')
+	fileCount += 1
+	if fileCount > 100000000:		# Debugging
+		break
+
+fileListFp.close()
+hcopyScriptFp.close()
+cnScriptFp.close()
+cnOutputScriptFp.close()
+
--- a/Demos/Speech/TimitLabels.mlf
+++ b/Demos/Speech/TimitLabels.mlf
--- a/Demos/Speech/TimitStateList.txt
+++ b/Demos/Speech/TimitStateList.txt
@ -0,0 +1,183 @@
+aa_s2
+aa_s3
+aa_s4
+ae_s2
+ae_s3
+ae_s4
+ah_s2
+ah_s3
+ah_s4
+ao_s2
+ao_s3
+ao_s4
+aw_s2
+aw_s3
+aw_s4
+ax_s2
+ax_s3
+ax_s4
+axh_s2
+axh_s3
+axh_s4
+axr_s2
+axr_s3
+axr_s4
+ay_s2
+ay_s3
+ay_s4
+b_s2
+b_s3
+b_s4
+bcl_s2
+bcl_s3
+bcl_s4
+ch_s2
+ch_s3
+ch_s4
+d_s2
+d_s3
+d_s4
+dcl_s2
+dcl_s3
+dcl_s4
+dh_s2
+dh_s3
+dh_s4
+dx_s2
+dx_s3
+dx_s4
+eh_s2
+eh_s3
+eh_s4
+el_s2
+el_s3
+el_s4
+em_s2
+em_s3
+em_s4
+en_s2
+en_s3
+en_s4
+eng_s2
+eng_s3
+eng_s4
+epi_s2
+epi_s3
+epi_s4
+er_s2
+er_s3
+er_s4
+ey_s2
+ey_s3
+ey_s4
+f_s2
+f_s3
+f_s4
+g_s2
+g_s3
+g_s4
+gcl_s2
+gcl_s3
+gcl_s4
+h#_s2
+h#_s3
+h#_s4
+hh_s2
+hh_s3
+hh_s4
+hv_s2
+hv_s3
+hv_s4
+ih_s2
+ih_s3
+ih_s4
+ix_s2
+ix_s3
+ix_s4
+iy_s2
+iy_s3
+iy_s4
+jh_s2
+jh_s3
+jh_s4
+k_s2
+k_s3
+k_s4
+kcl_s2
+kcl_s3
+kcl_s4
+l_s2
+l_s3
+l_s4
+m_s2
+m_s3
+m_s4
+n_s2
+n_s3
+n_s4
+ng_s2
+ng_s3
+ng_s4
+nx_s2
+nx_s3
+nx_s4
+ow_s2
+ow_s3
+ow_s4
+oy_s2
+oy_s3
+oy_s4
+p_s2
+p_s3
+p_s4
+pau_s2
+pau_s3
+pau_s4
+pcl_s2
+pcl_s3
+pcl_s4
+q_s2
+q_s3
+q_s4
+r_s2
+r_s3
+r_s4
+s_s2
+s_s3
+s_s4
+sh_s2
+sh_s3
+sh_s4
+t_s2
+t_s3
+t_s4
+tcl_s2
+tcl_s3
+tcl_s4
+th_s2
+th_s3
+th_s4
+uh_s2
+uh_s3
+uh_s4
+uw_s2
+uw_s3
+uw_s4
+ux_s2
+ux_s3
+ux_s4
+v_s2
+v_s3
+v_s4
+w_s2
+w_s3
+w_s4
+y_s2
+y_s3
+y_s4
+z_s2
+z_s3
+z_s4
+zh_s2
+zh_s3
+zh_s4
--- a/Demos/Speech/TimitSubjectList.txt
+++ b/Demos/Speech/TimitSubjectList.txt
--- a/Demos/Speech/TrainSimpleTimit.config
+++ b/Demos/Speech/TrainSimpleTimit.config
@ -0,0 +1,156 @@
+command=TimitTrainSimple:TimitTestSimple:TimitWriteSimple
+# command=TimitWriteSimple
+
+# deviceId=-1 for CPU, >=0 for GPU devices
+DeviceNumber=0
+stderr=Demo
+
+precision=float
+
+#######################################
+#  TRAINING CONFIG (Simple, Fixed LR) #
+#######################################
+
+TimitTrainSimple=[
+    action=train
+
+    modelPath=Models/TrainSimple.dnn
+
+    # deviceId=-1 for CPU, >=0 for GPU devices 
+    deviceId=$DeviceNumber$
+
+    traceLevel=1
+
+    # Notation xxx:yyy*n:zzz is equivalent to xxx, then yyy repeated n times,
+    #  then zzz
+    # example: 10:20*3:5 is equivalent to 10:20:20:20:5
+    SimpleNetworkBuilder=[
+        layerSizes=792:512*3:183
+        trainingCriterion=CrossEntropyWithSoftmax
+        evalCriterion=ErrorPrediction
+        layerTypes=Sigmoid
+        initValueScale=1.0
+        applyMeanVarNorm=true
+        uniformInit=true
+        needPrior=true
+    ]
+
+    SGD=[
+	# epochSize=0 means epochSize is the size of the training set
+        epochSize=0 
+        minibatchSize=256:1024
+        learningRatesPerMB=0.8:3.2*14:0.08
+        momentumPerMB=0.9
+        dropoutRate=0.0
+        # maxEpochs=25		# Full experiment
+        maxEpochs=1		# Small experiment for debugging
+    ]
+       
+    # Parameter values for the reader
+    reader=[
+      # reader to use
+      readerType=HTKMLFReader
+
+#     readMethod=blockRandomize
+      readMethod=rollingWindow
+
+      miniBatchMode=Partial
+      randomize=Auto
+      verbosity=1   
+
+      features=[
+	  dim=792
+	  scpFile=CntkTimit.scp
+      ]
+
+      labels=[
+	mlfFile=TimitLabels.mlf
+        labelDim=183
+	labelMappingFile=TimitStateList.txt
+      ]
+    ]
+]
+
+#######################################
+#  TESTING CONFIG (Simple)            #
+#######################################
+
+TimitTestSimple=[
+    action=test
+
+    modelPath=Models/TrainSimple.dnn
+
+    # deviceId=-1 for CPU, >=0 for GPU devices 
+    deviceId=$DeviceNumber$
+
+    traceLevel=1
+
+	# epochSize=0 means epochSize is the size of the training set
+        epochSize=0 
+        minibatchSize=256:1024
+       
+    # Parameter values for the reader
+    reader=[
+      # reader to use
+      readerType=HTKMLFReader
+
+      readMethod=rollingWindow
+
+      miniBatchMode=Partial
+      randomize=Auto
+      verbosity=1   
+
+	# numMBsToShowResult=10
+	# evalNodeNames
+
+      features=[
+	  dim=792
+	  scpFile=CntkTimit.scp
+      ]
+
+      labels=[
+	mlfFile=TimitLabels.mlf
+        labelDim=183
+	labelMappingFile=TimitStateList.txt
+      ]
+    ]
+
+]
+
+#######################################
+#  OUTPUT RESULTS (Simple)            #
+#######################################
+
+TimitWriteSimple=[
+    action=write
+
+    modelPath=Models/TrainSimple.dnn
+
+    # deviceId=-1 for CPU, >=0 for GPU devices 
+    deviceId=$DeviceNumber$
+
+    outputNodeNames=ScaledLogLikelihood
+    traceLevel=1
+
+    # Parameter values for the reader
+    reader=[
+      # reader to use
+      readerType=HTKMLFReader
+
+      features=[
+	  dim=792
+	  scpFile=CntkTimit.scp
+      ]
+
+    ]
+    writer=[
+      wrecords=50000 #Number of records to allocate space for in file
+      writerType=HTKMLFWriter
+      ScaledLogLikelihood = [
+          dim = 183
+	  scpFile=CntkTimitOutput.scp
+           type=Real			# Must be uppercase Real?
+	]
+    ]
+]
+
--- a/Demos/Speech/TrainSimpleTimit.sh
+++ b/Demos/Speech/TrainSimpleTimit.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+# Run the TIMIT speech demo end to end: build the script files, extract
+# features with HCopy, then run CNTK with TrainSimpleTimit.config.
+# Edit HTK and TIMIT below to match the local installation.
+
+# Stop at the first failing step instead of carrying on with missing or
+# stale script and feature files.
+set -e
+
+HTK=/cygdrive/c/Users/mslaney/Projects/HTK/bin.win32
+TIMIT=c:/Users/mslaney/Projects/TIMIT/timit/train
+
+HCopyConfig=HCopyTimit.config
+HCopyScript=HCopyTimit.scp
+
+# Expansions are quoted so paths containing spaces stay one argument.
+python TimitGetFiles.py "$TIMIT"
+
+"$HTK/hcopy.exe" -C "$HCopyConfig" -S "$HCopyScript"
+
+
+CNdir=../../../
+# Remove any previously written model files.
+rm -f Models/TrainSimple.dnn*
+
+time "$CNdir/cn.exe" configFile=TrainSimpleTimit.config
+
+
--- a/papers/CNTK-TechReport/figures/ConfusionData1.png
+++ b/papers/CNTK-TechReport/figures/ConfusionData1.png
--- a/papers/CNTK-TechReport/figures/ConfusionData100.png
+++ b/papers/CNTK-TechReport/figures/ConfusionData100.png
--- a/papers/CNTK-TechReport/figures/SpeechErrorRate.png
+++ b/papers/CNTK-TechReport/figures/SpeechErrorRate.png