Adapt to decoder API changes and new scorer packaging

Reuben Morais 2020-01-20 16:20:22 +01:00
Parent b4e3853064
Commit 4b97ac41d0
15 changed files with 88 additions and 135 deletions

View file

@ -21,41 +21,37 @@ sudo apt-get install ffmpeg
Here is an example for a local audio file:
```bash
node ./index.js --audio <AUDIO_FILE> \
--model $HOME/models/output_graph.pbmm \
--model $HOME/models/output_graph.pbmm
```
Here is an example for a remote RTMP-Stream:
```bash
node ./index.js --audio rtmp://<IP>:1935/live/teststream \
--model $HOME/models/output_graph.pbmm \
--model $HOME/models/output_graph.pbmm
```
## Examples
Real time streaming inference with DeepSpeech's example audio ([audio-0.4.1.tar.gz](https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz)).
```bash
node ./index.js --audio $HOME/audio/2830-3980-0043.wav \
--lm $HOME/models/lm.binary \
--trie $HOME/models/trie \
--model $HOME/models/output_graph.pbmm \
--scorer $HOME/models/kenlm.scorer \
--model $HOME/models/output_graph.pbmm
```
```bash
node ./index.js --audio $HOME/audio/4507-16021-0012.wav \
--lm $HOME/models/lm.binary \
--trie $HOME/models/trie \
--model $HOME/models/output_graph.pbmm \
--scorer $HOME/models/kenlm.scorer \
--model $HOME/models/output_graph.pbmm
```
```bash
node ./index.js --audio $HOME/audio/8455-210777-0068.wav \
--lm $HOME/models/lm.binary \
--trie $HOME/models/trie \
--model $HOME/models/output_graph.pbmm \
--scorer $HOME/models/kenlm.scorer \
--model $HOME/models/output_graph.pbmm
```
Real time streaming inference in combination with a RTMP server.
```bash
node ./index.js --audio rtmp://<HOST>/<APP>/<KEY> \
--lm $HOME/models/lm.binary \
--trie $HOME/models/trie \
--model $HOME/models/output_graph.pbmm \
--scorer $HOME/models/kenlm.scorer \
--model $HOME/models/output_graph.pbmm
```
## Notes

View file

@ -11,12 +11,6 @@ const { spawn } = require('child_process');
// Beam width used in the CTC decoder when building candidate transcriptions
const BEAM_WIDTH = 500;
// The alpha hyperparameter of the CTC decoder. Language Model weight
const LM_ALPHA = 0.75;
// The beta hyperparameter of the CTC decoder. Word insertion bonus.
const LM_BETA = 1.85;
let VersionAction = function VersionAction(options) {
options = options || {};
options.nargs = 0;
@ -32,8 +26,7 @@ VersionAction.prototype.call = function(parser) {
let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
parser.addArgument(['--scorer'], {help: 'Path to the scorer file', nargs: '?'});
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio source to run (ffmpeg supported formats)'});
parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
let args = parser.parseArgs();
@ -48,12 +41,12 @@ let model = new Ds.Model(args['model'], BEAM_WIDTH);
const model_load_end = process.hrtime(model_load_start);
console.error('Loaded model in %ds.', totalTime(model_load_end));
if (args['lm'] && args['trie']) {
console.error('Loading language model from files %s %s', args['lm'], args['trie']);
const lm_load_start = process.hrtime();
model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA);
const lm_load_end = process.hrtime(lm_load_start);
console.error('Loaded language model in %ds.', totalTime(lm_load_end));
if (args['scorer']) {
console.error('Loading scorer from file %s', args['scorer']);
const scorer_load_start = process.hrtime();
model.enableExternalScorer(args['scorer']);
const scorer_load_end = process.hrtime(scorer_load_start);
console.error('Loaded scorer in %ds.', totalTime(scorer_load_end));
}
// Default is 16kHz
@ -99,7 +92,7 @@ let sctx = model.createStream();
function finishStream() {
const model_load_start = process.hrtime();
console.error('Running inference.');
console.log('Transcription: ', model.finishStream(sctx));
console.log('Transcription: ', sctx.finishStream());
const model_load_end = process.hrtime(model_load_start);
console.error('Inference took %ds for %ds audio file.', totalTime(model_load_end), audioLength.toPrecision(4));
audioLength = 0;
@ -112,7 +105,7 @@ function intermediateDecode() {
function feedAudioContent(chunk) {
audioLength += (chunk.length / 2) * ( 1 / AUDIO_SAMPLE_RATE);
model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
sctx.feedAudioContent(chunk.slice(0, chunk.length / 2));
}
function processVad(data) {
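
The API change above is the same across bindings: `enableExternalScorer(path)` replaces `enableDecoderWithLM(lm, trie, alpha, beta)`, and `feedAudioContent`/`finishStream` move from the model onto the stream object returned by `createStream()`. A minimal Python sketch of the new flow, for reference only (the paths and WAV file are placeholders, not part of this commit):
```python
import wave

import numpy as np
import deepspeech

# Placeholder paths; use the files shipped with the release model package.
MODEL_PATH = 'output_graph.pbmm'
SCORER_PATH = 'kenlm.scorer'
BEAM_WIDTH = 500

model = deepspeech.Model(MODEL_PATH, BEAM_WIDTH)
# Single call replaces enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA).
model.enableExternalScorer(SCORER_PATH)

# Streaming: the methods now live on the stream object, not the model.
stream = model.createStream()
with wave.open('audio.wav', 'rb') as wav:  # assumed 16-bit mono at the model's sample rate
    frames = wav.readframes(wav.getnframes())
stream.feedAudioContent(np.frombuffer(frames, np.int16))
print('Transcription:', stream.finishStream())
```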

View file

@ -11,17 +11,14 @@ pushd ${THIS}
npm install
node ./index.js --audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \
--lm $HOME/DeepSpeech/models/lm.binary \
--trie $HOME/DeepSpeech/models/trie \
--scorer $HOME/DeepSpeech/models/kenlm.scorer \
--model $HOME/DeepSpeech/models/output_graph.pbmm
node ./index.js --audio $HOME/DeepSpeech/audio/4507-16021-0012.wav \
--lm $HOME/DeepSpeech/models/lm.binary \
--trie $HOME/DeepSpeech/models/trie \
--scorer $HOME/DeepSpeech/models/kenlm.scorer \
--model $HOME/DeepSpeech/models/output_graph.pbmm
node ./index.js --audio $HOME/DeepSpeech/audio/8455-210777-0068.wav \
--lm $HOME/DeepSpeech/models/lm.binary \
--trie $HOME/DeepSpeech/models/trie \
--scorer $HOME/DeepSpeech/models/kenlm.scorer \
--model $HOME/DeepSpeech/models/output_graph.pbmm
popd

View file

@ -29,9 +29,8 @@ Usage
.. code-block::
usage: mic_vad_streaming.py [-h] [-v VAD_AGGRESSIVENESS] [--nospinner]
[-w SAVEWAV] -m MODEL [-l LM]
[-t TRIE] [-nf N_FEATURES] [-nc N_CONTEXT]
[-la LM_ALPHA] [-lb LM_BETA]
[-w SAVEWAV] -m MODEL [-s SCORER]
[-nf N_FEATURES] [-nc N_CONTEXT]
[-bw BEAM_WIDTH]
Stream from microphone to DeepSpeech using VAD
@ -49,21 +48,13 @@ Usage
Path to the model (protocol buffer binary file, or
entire directory containing all standard-named files
for model)
-l LM, --lm LM Path to the language model binary file. Default:
lm.binary
-t TRIE, --trie TRIE Path to the language model trie file created with
native_client/generate_trie. Default: trie
-s SCORER, --scorer SCORER
Path to the external scorer file. Default: kenlm.scorer
-nf N_FEATURES, --n_features N_FEATURES
Number of MFCC features to use. Default: 26
-nc N_CONTEXT, --n_context N_CONTEXT
Size of the context window used for producing
timesteps in the input vector. Default: 9
-la LM_ALPHA, --lm_alpha LM_ALPHA
The alpha hyperparameter of the CTC decoder. Language
Model weight. Default: 0.75
-lb LM_BETA, --lm_beta LM_BETA
The beta hyperparameter of the CTC decoder. Word insertion
bonus. Default: 1.85
-bw BEAM_WIDTH, --beam_width BEAM_WIDTH
Beam width used in the CTC decoder when building
candidate transcriptions. Default: 500

View file

@ -156,16 +156,14 @@ def main(ARGS):
if os.path.isdir(ARGS.model):
model_dir = ARGS.model
ARGS.model = os.path.join(model_dir, 'output_graph.pb')
ARGS.lm = os.path.join(model_dir, ARGS.lm)
ARGS.trie = os.path.join(model_dir, ARGS.trie)
ARGS.scorer = os.path.join(model_dir, ARGS.scorer)
print('Initializing model...')
logging.info("ARGS.model: %s", ARGS.model)
model = deepspeech.Model(ARGS.model, ARGS.beam_width)
if ARGS.lm and ARGS.trie:
logging.info("ARGS.lm: %s", ARGS.lm)
logging.info("ARGS.trie: %s", ARGS.trie)
model.enableDecoderWithLM(ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta)
if ARGS.scorer:
logging.info("ARGS.scorer: %s", ARGS.scorer)
model.enableExternalScorer(ARGS.scorer)
# Start audio with VAD
vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
@ -185,7 +183,7 @@ def main(ARGS):
if frame is not None:
if spinner: spinner.start()
logging.debug("streaming frame")
model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
if ARGS.savewav: wav_data.extend(frame)
else:
if spinner: spinner.stop()
@ -193,15 +191,13 @@ def main(ARGS):
if ARGS.savewav:
vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
wav_data = bytearray()
text = model.finishStream(stream_context)
text = stream_context.finishStream()
print("Recognized: %s" % text)
stream_context = model.createStream()
if __name__ == '__main__':
BEAM_WIDTH = 500
DEFAULT_SAMPLE_RATE = 16000
LM_ALPHA = 0.75
LM_BETA = 1.85
import argparse
parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
@ -217,18 +213,12 @@ if __name__ == '__main__':
parser.add_argument('-m', '--model', required=True,
help="Path to the model (protocol buffer binary file, or entire directory containing all standard-named files for model)")
parser.add_argument('-l', '--lm', default='lm.binary',
help="Path to the language model binary file. Default: lm.binary")
parser.add_argument('-t', '--trie', default='trie',
help="Path to the language model trie file created with native_client/generate_trie. Default: trie")
parser.add_argument('-s', '--scorer', default='kenlm.scorer',
help="Path to the external scorer file. Default: kenlm.scorer")
parser.add_argument('-d', '--device', type=int, default=None,
help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().")
parser.add_argument('-r', '--rate', type=int, default=DEFAULT_SAMPLE_RATE,
help=f"Input device sample rate. Default: {DEFAULT_SAMPLE_RATE}. Your device may require 44100.")
parser.add_argument('-la', '--lm_alpha', type=float, default=LM_ALPHA,
help=f"The alpha hyperparameter of the CTC decoder. Language Model weight. Default: {LM_ALPHA}")
parser.add_argument('-lb', '--lm_beta', type=float, default=LM_BETA,
help=f"The beta hyperparameter of the CTC decoder. Word insertion bonus. Default: {LM_BETA}")
parser.add_argument('-bw', '--beam_width', type=int, default=BEAM_WIDTH,
help=f"Beam width used in the CTC decoder when building candidate transcriptions. Default: {BEAM_WIDTH}")

View file

@ -8,13 +8,12 @@ pushd ${THIS}
source ../tests.sh
pip install --user $(get_python_wheel_url "$1")
pip install --user -r requirements.txt
pip install --user -r <(grep -v deepspeech requirements.txt)
pulseaudio &
python mic_vad_streaming.py \
--model $HOME/DeepSpeech/models/output_graph.pbmm \
--lm $HOME/DeepSpeech/models/lm.binary \
--trie $HOME/DeepSpeech/models/trie \
--scorer $HOME/DeepSpeech/models/kenlm.scorer \
--file $HOME/DeepSpeech/audio/2830-3980-0043.wav
popd

View file

@ -52,8 +52,8 @@
Margin="95,69,0,0"
HorizontalAlignment="Left"
VerticalAlignment="Top"
Command="{Binding EnableLanguageModelCommand}"
Content="Enable LM" />
Command="{Binding EnableExternalScorerCommand}"
Content="Enable external scorer" />
<Button
Width="75"
Height="25"

View file

@ -24,17 +24,16 @@ namespace DeepSpeech.WPF.ViewModels
{
#region Constants
private const int SampleRate = 16000;
private const string LMPath = "lm.binary";
private const string TriePath = "trie";
private const string ScorerPath = "kenlm.scorer";
#endregion
private readonly IDeepSpeech _sttClient;
#region Commands
/// <summary>
/// Gets or sets the command that enables the language model.
/// Gets or sets the command that enables the external scorer.
/// </summary>
public IAsyncCommand EnableLanguageModelCommand { get; private set; }
public IAsyncCommand EnableExternalScorerCommand { get; private set; }
/// <summary>
/// Gets or sets the command that runs inference using an audio file.
@ -146,15 +145,15 @@ namespace DeepSpeech.WPF.ViewModels
set => SetProperty(ref _statusMessage, value);
}
private bool _languageModelEnabled;
private bool _externalScorerEnabled;
/// <summary>
/// Gets or sets the language model status.
/// Gets or sets the external scorer status.
/// </summary>
private bool LanguageModelEnabled
private bool ExternalScorerEnabled
{
get => _languageModelEnabled;
set => SetProperty(ref _languageModelEnabled, value,
onChanged: () => ((AsyncCommand)EnableLanguageModelCommand).RaiseCanExecuteChanged());
get => _externalScorerEnabled;
set => SetProperty(ref _externalScorerEnabled, value,
onChanged: () => ((AsyncCommand)EnableExternalScorerCommand).RaiseCanExecuteChanged());
}
private bool _isRunningInference;
@ -205,8 +204,8 @@ namespace DeepSpeech.WPF.ViewModels
{
_sttClient = sttClient;
EnableLanguageModelCommand = new AsyncCommand(()=>EnableLanguageModelAsync(LMPath,TriePath),
_ => !LanguageModelEnabled);
EnableExternalScorerCommand = new AsyncCommand(()=>EnableExternalScorerAsync(ScorerPath),
_ => !ExternalScorerEnabled);
InferenceFromFileCommand = new AsyncCommand(ExecuteInferenceFromFileAsync,
_ => !IsRunningInference);
@ -322,21 +321,18 @@ namespace DeepSpeech.WPF.ViewModels
}
/// <summary>
/// Enables the language model.
/// Enables the external scorer.
/// </summary>
/// <param name="lmPath">Language model path.</param>
/// <param name="triePath">Trie path.</param>
/// <param name="scorerPath">External scorer path.</param>
/// <returns>A Task to await.</returns>
public async Task EnableLanguageModelAsync(string lmPath, string triePath)
public async Task EnableExternalScorerAsync(string scorerPath)
{
try
{
StatusMessage = "Loading language model...";
const float LM_ALPHA = 0.75f;
const float LM_BETA = 1.85f;
await Task.Run(() => _sttClient.EnableDecoderWithLM(LMPath, TriePath, LM_ALPHA, LM_BETA));
LanguageModelEnabled = true;
StatusMessage = "Language model loaded.";
StatusMessage = "Loading external scorer...";
await Task.Run(() => _sttClient.EnableExternalScorer(ScorerPath));
ExternalScorerEnabled = true;
StatusMessage = "External scorer loaded.";
}
catch (Exception ex)
{

View file

@ -11,8 +11,7 @@ Edit references to models path if necessary:
```
let modelPath = './models/output_graph.pbmm';
let lmPath = './models/lm.binary';
let triePath = './models/trie';
let scorerPath = './models/kenlm.scorer';
```
Install Sox (for .wav file loading):

View file

@ -12,12 +12,9 @@ let model = new DeepSpeech.Model(modelPath, BEAM_WIDTH);
let desiredSampleRate = model.sampleRate();
const LM_ALPHA = 0.75;
const LM_BETA = 1.85;
let lmPath = './models/lm.binary';
let triePath = './models/trie';
let scorerPath = './models/kenlm.scorer';
model.enableDecoderWithLM(lmPath, triePath, LM_ALPHA, LM_BETA);
model.enableExternalScorer(scorerPath);
let audioFile = process.argv[2] || './audio/2830-3980-0043.wav';

View file

@ -18,7 +18,7 @@ def main(args):
parser.add_argument('--audio', required=False,
help='Path to the audio file to run (WAV format)')
parser.add_argument('--model', required=True,
help='Path to directory that contains all model files (output_graph, lm and trie)')
help='Path to directory that contains all model files (output_graph and scorer)')
parser.add_argument('--stream', required=False, action='store_true',
help='To use deepspeech streaming interface')
args = parser.parse_args()
@ -34,13 +34,13 @@ def main(args):
dirName = os.path.expanduser(args.model)
# Resolve all the paths of model files
output_graph, lm, trie = wavTranscriber.resolve_models(dirName)
output_graph, scorer = wavTranscriber.resolve_models(dirName)
# Load output_graph, alpahbet, lm and trie
model_retval = wavTranscriber.load_model(output_graph, lm, trie)
# Load output_graph, alphabet and scorer
model_retval = wavTranscriber.load_model(output_graph, scorer)
if args.audio is not None:
title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'LM Load Time(s)']
title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'Scorer Load Time(s)']
print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
inference_time = 0.0
@ -81,9 +81,9 @@ def main(args):
try:
while True:
data = subproc.stdout.read(512)
model_retval[0].feedAudioContent(sctx, np.frombuffer(data, np.int16))
sctx.feedAudioContent(np.frombuffer(data, np.int16))
except KeyboardInterrupt:
print('Transcription: ', model_retval[0].finishStream(sctx))
print('Transcription: ', sctx.finishStream())
subproc.terminate()
subproc.wait()

View file

@ -109,7 +109,7 @@ class App(QMainWindow):
self.microphone = QRadioButton("Microphone")
self.fileUpload = QRadioButton("File Upload")
self.browseBox = QLineEdit(self, placeholderText="Wave File, Mono @ 16 kHz, 16bit Little-Endian")
self.modelsBox = QLineEdit(self, placeholderText="Directory path for output_graph, lm & trie")
self.modelsBox = QLineEdit(self, placeholderText="Directory path for output_graph and scorer")
self.textboxTranscript = QPlainTextEdit(self, placeholderText="Transcription")
self.browseButton = QPushButton('Browse', self)
self.browseButton.setToolTip('Select a wav file')
@ -238,9 +238,9 @@ class App(QMainWindow):
def modelResult(self, dirName):
# Fetch and Resolve all the paths of model files
output_graph, lm, trie = wavTranscriber.resolve_models(dirName)
# Load output_graph, alpahbet, lm and trie
self.model = wavTranscriber.load_model(output_graph, lm, trie)
output_graph, scorer = wavTranscriber.resolve_models(dirName)
# Load output_graph, alphabet and scorer
self.model = wavTranscriber.load_model(output_graph, scorer)
def modelFinish(self):
# self.timer.stop()
@ -316,9 +316,9 @@ class App(QMainWindow):
logging.debug("Recording from your microphone")
while (not self.openMicrophone.isChecked()):
data = context[1].stdout.read(512)
context[2].feedAudioContent(context[0], np.frombuffer(data, np.int16))
context[0].feedAudioContent(np.frombuffer(data, np.int16))
else:
transcript = context[2].finishStream(context[0])
transcript = context[0].finishStream()
context[1].terminate()
context[1].wait()
self.show()
@ -367,7 +367,7 @@ class App(QMainWindow):
# Format pretty, extract filename from the full file path
filename, ext = os.path.split(os.path.basename(waveFile))
title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'LM Load Time(s)']
title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'Scorer Load Time(s)']
logging.debug("************************************************************************************************************")
logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, self.model[1], self.model[2]))

View file

@ -8,7 +8,7 @@ pushd ${THIS}
source ../tests.sh
pip install --user $(get_python_wheel_url "$1")
pip install --user -r requirements.txt
pip install --user -r <(grep -v deepspeech requirements.txt)
python audioTranscript_cmd.py \
--audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \

View file

@ -8,28 +8,25 @@ from timeit import default_timer as timer
'''
Load the pre-trained model into the memory
@param models: Output Graph Protocol Buffer file
@param lm: Language model file
@param trie: Trie file
@param scorer: Scorer file
@Retval
Returns a list [DeepSpeech Object, Model Load Time, LM Load Time]
Returns a list [DeepSpeech Object, Model Load Time, Scorer Load Time]
'''
def load_model(models, lm, trie):
def load_model(models, scorer):
BEAM_WIDTH = 500
LM_ALPHA = 0.75
LM_BETA = 1.85
model_load_start = timer()
ds = Model(models, BEAM_WIDTH)
model_load_end = timer() - model_load_start
logging.debug("Loaded model in %0.3fs." % (model_load_end))
lm_load_start = timer()
ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
lm_load_end = timer() - lm_load_start
logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))
scorer_load_start = timer()
ds.enableExternalScorer(scorer)
scorer_load_end = timer() - scorer_load_start
logging.debug('Loaded external scorer in %0.3fs.' % (scorer_load_end))
return [ds, model_load_end, lm_load_end]
return [ds, model_load_end, scorer_load_end]
'''
Run Inference on input audio file
@ -60,18 +57,16 @@ Resolve directory path for the models and fetch each of them.
@param dirName: Path to the directory containing pre-trained models
@Retval:
Retunns a tuple containing each of the model files (pb, lm and trie)
Returns a tuple containing each of the model files (pb, scorer)
'''
def resolve_models(dirName):
pb = glob.glob(dirName + "/*.pb")[0]
logging.debug("Found Model: %s" % pb)
lm = glob.glob(dirName + "/lm.binary")[0]
trie = glob.glob(dirName + "/trie")[0]
logging.debug("Found Language Model: %s" % lm)
logging.debug("Found Trie: %s" % trie)
scorer = glob.glob(dirName + "/kenlm.scorer")[0]
logging.debug("Found scorer: %s" % scorer)
return pb, lm, trie
return pb, scorer
'''
Generate VAD segments. Filters out non-voiced audio frames.
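
With these signatures, callers resolve and load the model as the updated audioTranscript_cmd.py above does. A short usage sketch (the directory path is a placeholder; it only needs to contain the `.pb` graph and `kenlm.scorer`):
```python
import os
import wavTranscriber

# Placeholder: directory holding output_graph.pb and kenlm.scorer.
dir_name = os.path.expanduser('~/DeepSpeech/models')

output_graph, scorer = wavTranscriber.resolve_models(dir_name)
ds, model_load_time, scorer_load_time = wavTranscriber.load_model(output_graph, scorer)
print('Loaded model in %.3fs, scorer in %.3fs' % (model_load_time, scorer_load_time))
```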

View file

@ -23,7 +23,7 @@ A list of distributions where the package is available can be found at: https://
#### 0.1 Download Deepspeech
Either clone from git via git clone, or Download a version from the release page
For the next steps we assume you have extracted the files to ~/Deepspeech
For the next steps we assume you have extracted the files to `~/Deepspeech`
#### 0.2 Setup your environment
@ -64,7 +64,7 @@ Set the aggressiveness mode, to an integer between 0 and 3.
$ python3 audioTranscript_cmd.py --aggressive 1 --audio ./audio/guido-van-rossum.wav --model ./models/0.4.1/
Filename Duration(s) Inference Time(s) Model Load Time(s) LM Load Time(s)
Filename Duration(s) Inference Time(s) Model Load Time(s) Scorer Load Time(s)
sample_rec.wav 13.710 20.797 5.593 17.742
```