Adapt to decoder API changes and new scorer packaging
Parent: b4e3853064
Commit: 4b97ac41d0
@@ -21,41 +21,37 @@ sudo apt-get install ffmpeg
 Here is an example for a local audio file:
 
 ```bash
 node ./index.js --audio <AUDIO_FILE> \
-  --model $HOME/models/output_graph.pbmm \
+  --model $HOME/models/output_graph.pbmm
 ```
 
 Here is an example for a remote RTMP-Stream:
 
 ```bash
 node ./index.js --audio rtmp://<IP>:1935/live/teststream \
-  --model $HOME/models/output_graph.pbmm \
+  --model $HOME/models/output_graph.pbmm
 ```
 
 ## Examples
 
 Real time streaming inference with DeepSpeech's example audio ([audio-0.4.1.tar.gz](https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz)).
 
 ```bash
 node ./index.js --audio $HOME/audio/2830-3980-0043.wav \
-  --lm $HOME/models/lm.binary \
-  --trie $HOME/models/trie \
-  --model $HOME/models/output_graph.pbmm \
+  --scorer $HOME/models/kenlm.scorer \
+  --model $HOME/models/output_graph.pbmm
 ```
 
 ```bash
 node ./index.js --audio $HOME/audio/4507-16021-0012.wav \
-  --lm $HOME/models/lm.binary \
-  --trie $HOME/models/trie \
-  --model $HOME/models/output_graph.pbmm \
+  --scorer $HOME/models/kenlm.scorer \
+  --model $HOME/models/output_graph.pbmm
 ```
 
 ```bash
 node ./index.js --audio $HOME/audio/8455-210777-0068.wav \
-  --lm $HOME/models/lm.binary \
-  --trie $HOME/models/trie \
-  --model $HOME/models/output_graph.pbmm \
+  --scorer $HOME/models/kenlm.scorer \
+  --model $HOME/models/output_graph.pbmm
 ```
 
 Real time streaming inference in combination with a RTMP server.
 
 ```bash
 node ./index.js --audio rtmp://<HOST>/<APP>/<KEY> \
-  --lm $HOME/models/lm.binary \
-  --trie $HOME/models/trie \
-  --model $HOME/models/output_graph.pbmm \
+  --scorer $HOME/models/kenlm.scorer \
+  --model $HOME/models/output_graph.pbmm
 ```
 
 ## Notes

@@ -11,12 +11,6 @@ const { spawn } = require('child_process');
 // Beam width used in the CTC decoder when building candidate transcriptions
 const BEAM_WIDTH = 500;
 
-// The alpha hyperparameter of the CTC decoder. Language Model weight
-const LM_ALPHA = 0.75;
-
-// The beta hyperparameter of the CTC decoder. Word insertion bonus.
-const LM_BETA = 1.85;
-
 let VersionAction = function VersionAction(options) {
   options = options || {};
   options.nargs = 0;
@@ -32,8 +26,7 @@ VersionAction.prototype.call = function(parser) {
 
 let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
 parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
-parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
-parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
+parser.addArgument(['--scorer'], {help: 'Path to the scorer file', nargs: '?'});
 parser.addArgument(['--audio'], {required: true, help: 'Path to the audio source to run (ffmpeg supported formats)'});
 parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
 let args = parser.parseArgs();
@@ -48,12 +41,12 @@ let model = new Ds.Model(args['model'], BEAM_WIDTH);
 const model_load_end = process.hrtime(model_load_start);
 console.error('Loaded model in %ds.', totalTime(model_load_end));
 
-if (args['lm'] && args['trie']) {
-  console.error('Loading language model from files %s %s', args['lm'], args['trie']);
-  const lm_load_start = process.hrtime();
-  model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA);
-  const lm_load_end = process.hrtime(lm_load_start);
-  console.error('Loaded language model in %ds.', totalTime(lm_load_end));
+if (args['scorer']) {
+  console.error('Loading scorer from file %s', args['scorer']);
+  const scorer_load_start = process.hrtime();
+  model.enableExternalScorer(args['scorer']);
+  const scorer_load_end = process.hrtime(scorer_load_start);
+  console.error('Loaded scorer in %ds.', totalTime(scorer_load_end));
 }
 
 // Default is 16kHz
@@ -99,7 +92,7 @@ let sctx = model.createStream();
 function finishStream() {
   const model_load_start = process.hrtime();
   console.error('Running inference.');
-  console.log('Transcription: ', model.finishStream(sctx));
+  console.log('Transcription: ', sctx.finishStream());
   const model_load_end = process.hrtime(model_load_start);
   console.error('Inference took %ds for %ds audio file.', totalTime(model_load_end), audioLength.toPrecision(4));
   audioLength = 0;
@@ -112,7 +105,7 @@ function intermediateDecode() {
 
 function feedAudioContent(chunk) {
   audioLength += (chunk.length / 2) * ( 1 / AUDIO_SAMPLE_RATE);
-  model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
+  sctx.feedAudioContent(chunk.slice(0, chunk.length / 2));
 }
 
 function processVad(data) {

@@ -11,17 +11,14 @@ pushd ${THIS}
 npm install
 
 node ./index.js --audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \
-  --lm $HOME/DeepSpeech/models/lm.binary \
-  --trie $HOME/DeepSpeech/models/trie \
+  --scorer $HOME/DeepSpeech/models/kenlm.scorer \
   --model $HOME/DeepSpeech/models/output_graph.pbmm
 
 node ./index.js --audio $HOME/DeepSpeech/audio/4507-16021-0012.wav \
-  --lm $HOME/DeepSpeech/models/lm.binary \
-  --trie $HOME/DeepSpeech/models/trie \
+  --scorer $HOME/DeepSpeech/models/kenlm.scorer \
   --model $HOME/DeepSpeech/models/output_graph.pbmm
 
 node ./index.js --audio $HOME/DeepSpeech/audio/8455-210777-0068.wav \
-  --lm $HOME/DeepSpeech/models/lm.binary \
-  --trie $HOME/DeepSpeech/models/trie \
+  --scorer $HOME/DeepSpeech/models/kenlm.scorer \
   --model $HOME/DeepSpeech/models/output_graph.pbmm
 popd

@@ -29,9 +29,8 @@ Usage
 .. code-block::
 
     usage: mic_vad_streaming.py [-h] [-v VAD_AGGRESSIVENESS] [--nospinner]
-                                [-w SAVEWAV] -m MODEL [-l LM]
-                                [-t TRIE] [-nf N_FEATURES] [-nc N_CONTEXT]
-                                [-la LM_ALPHA] [-lb LM_BETA]
+                                [-w SAVEWAV] -m MODEL [-s SCORER]
+                                [-nf N_FEATURES] [-nc N_CONTEXT]
                                 [-bw BEAM_WIDTH]
 
     Stream from microphone to DeepSpeech using VAD
@@ -49,21 +48,13 @@ Usage
                           Path to the model (protocol buffer binary file, or
                           entire directory containing all standard-named files
                           for model)
-    -l LM, --lm LM        Path to the language model binary file. Default:
-                          lm.binary
-    -t TRIE, --trie TRIE  Path to the language model trie file created with
-                          native_client/generate_trie. Default: trie
+    -s SCORER, --scorer SCORER
+                          Path to the external scorer file. Default: kenlm.scorer
     -nf N_FEATURES, --n_features N_FEATURES
                           Number of MFCC features to use. Default: 26
     -nc N_CONTEXT, --n_context N_CONTEXT
                           Size of the context window used for producing
                           timesteps in the input vector. Default: 9
-    -la LM_ALPHA, --lm_alpha LM_ALPHA
-                          The alpha hyperparameter of the CTC decoder. Language
-                          Model weight. Default: 0.75
-    -lb LM_BETA, --lm_beta LM_BETA
-                          The beta hyperparameter of the CTC decoder. Word insertion
-                          bonus. Default: 1.85
     -bw BEAM_WIDTH, --beam_width BEAM_WIDTH
                           Beam width used in the CTC decoder when building
                           candidate transcriptions. Default: 500

@@ -156,16 +156,14 @@ def main(ARGS):
     if os.path.isdir(ARGS.model):
         model_dir = ARGS.model
         ARGS.model = os.path.join(model_dir, 'output_graph.pb')
-        ARGS.lm = os.path.join(model_dir, ARGS.lm)
-        ARGS.trie = os.path.join(model_dir, ARGS.trie)
+        ARGS.scorer = os.path.join(model_dir, ARGS.scorer)
 
     print('Initializing model...')
     logging.info("ARGS.model: %s", ARGS.model)
     model = deepspeech.Model(ARGS.model, ARGS.beam_width)
-    if ARGS.lm and ARGS.trie:
-        logging.info("ARGS.lm: %s", ARGS.lm)
-        logging.info("ARGS.trie: %s", ARGS.trie)
-        model.enableDecoderWithLM(ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta)
+    if ARGS.scorer:
+        logging.info("ARGS.scorer: %s", ARGS.scorer)
+        model.enableExternalScorer(ARGS.scorer)
 
     # Start audio with VAD
     vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
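The initialization pattern above condenses to a few lines. A minimal sketch, assuming the 0.7-era `deepspeech` Python package used throughout this commit, with hypothetical file paths; the two-argument `Model()` call mirrors the context line in the hunk above:

```python
# Minimal sketch: load the model first, then the optional external scorer.
# Paths are hypothetical; 'kenlm.scorer' follows the naming convention
# used across this commit.
import deepspeech

model = deepspeech.Model('models/output_graph.pbmm', 500)  # 500 = beam width
model.enableExternalScorer('models/kenlm.scorer')          # replaces lm.binary + trie
```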
@@ -185,7 +183,7 @@ def main(ARGS):
         if frame is not None:
             if spinner: spinner.start()
             logging.debug("streaming frame")
-            model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
+            stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
             if ARGS.savewav: wav_data.extend(frame)
         else:
             if spinner: spinner.stop()
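This is the recurring shape of the API change in this commit: feeding and finishing move from model-level calls that take the stream as an argument to methods on the stream object itself. A hedged sketch of the new lifecycle, assuming `model` from the previous sketch and a hypothetical `frames` iterable of raw 16-bit PCM chunks:

```python
import numpy as np

stream_context = model.createStream()   # streams are still created via the model
for frame in frames:                    # frames: hypothetical 16-bit PCM chunks
    stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
text = stream_context.finishStream()    # decodes and closes the stream
print("Recognized: %s" % text)
stream_context = model.createStream()   # a finished stream is not reused; make a new one
```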
@@ -193,15 +191,13 @@ def main(ARGS):
             if ARGS.savewav:
                 vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
                 wav_data = bytearray()
-            text = model.finishStream(stream_context)
+            text = stream_context.finishStream()
             print("Recognized: %s" % text)
             stream_context = model.createStream()
 
 if __name__ == '__main__':
     BEAM_WIDTH = 500
     DEFAULT_SAMPLE_RATE = 16000
-    LM_ALPHA = 0.75
-    LM_BETA = 1.85
 
     import argparse
     parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
@@ -217,18 +213,12 @@ if __name__ == '__main__':
 
     parser.add_argument('-m', '--model', required=True,
                         help="Path to the model (protocol buffer binary file, or entire directory containing all standard-named files for model)")
-    parser.add_argument('-l', '--lm', default='lm.binary',
-                        help="Path to the language model binary file. Default: lm.binary")
-    parser.add_argument('-t', '--trie', default='trie',
-                        help="Path to the language model trie file created with native_client/generate_trie. Default: trie")
+    parser.add_argument('-s', '--scorer', default='kenlm.scorer',
+                        help="Path to the external scorer file. Default: kenlm.scorer")
     parser.add_argument('-d', '--device', type=int, default=None,
                         help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().")
     parser.add_argument('-r', '--rate', type=int, default=DEFAULT_SAMPLE_RATE,
                         help=f"Input device sample rate. Default: {DEFAULT_SAMPLE_RATE}. Your device may require 44100.")
-    parser.add_argument('-la', '--lm_alpha', type=float, default=LM_ALPHA,
-                        help=f"The alpha hyperparameter of the CTC decoder. Language Model weight. Default: {LM_ALPHA}")
-    parser.add_argument('-lb', '--lm_beta', type=float, default=LM_BETA,
-                        help=f"The beta hyperparameter of the CTC decoder. Word insertion bonus. Default: {LM_BETA}")
     parser.add_argument('-bw', '--beam_width', type=int, default=BEAM_WIDTH,
                         help=f"Beam width used in the CTC decoder when building candidate transcriptions. Default: {BEAM_WIDTH}")

@@ -8,13 +8,12 @@ pushd ${THIS}
 source ../tests.sh
 
 pip install --user $(get_python_wheel_url "$1")
-pip install --user -r requirements.txt
+pip install --user -r <(grep -v deepspeech requirements.txt)
 
 pulseaudio &
 
 python mic_vad_streaming.py \
   --model $HOME/DeepSpeech/models/output_graph.pbmm \
-  --lm $HOME/DeepSpeech/models/lm.binary \
-  --trie $HOME/DeepSpeech/models/trie \
+  --scorer $HOME/DeepSpeech/models/kenlm.scorer \
   --file $HOME/DeepSpeech/audio/2830-3980-0043.wav
 popd

@@ -52,8 +52,8 @@
     Margin="95,69,0,0"
     HorizontalAlignment="Left"
     VerticalAlignment="Top"
-    Command="{Binding EnableLanguageModelCommand}"
-    Content="Enable LM" />
+    Command="{Binding EnableExternalScorerCommand}"
+    Content="Enable external scorer" />
 <Button
     Width="75"
     Height="25"
@@ -24,17 +24,16 @@ namespace DeepSpeech.WPF.ViewModels
     {
         #region Constants
         private const int SampleRate = 16000;
-        private const string LMPath = "lm.binary";
-        private const string TriePath = "trie";
+        private const string ScorerPath = "kenlm.scorer";
         #endregion
 
         private readonly IDeepSpeech _sttClient;
 
         #region Commands
         /// <summary>
-        /// Gets or sets the command that enables the language model.
+        /// Gets or sets the command that enables the external scorer.
         /// </summary>
-        public IAsyncCommand EnableLanguageModelCommand { get; private set; }
+        public IAsyncCommand EnableExternalScorerCommand { get; private set; }
 
         /// <summary>
         /// Gets or sets the command that runs inference using an audio file.
@@ -146,15 +145,15 @@ namespace DeepSpeech.WPF.ViewModels
             set => SetProperty(ref _statusMessage, value);
         }
 
-        private bool _languageModelEnabled;
+        private bool _externalScorerEnabled;
         /// <summary>
-        /// Gets or sets the language model status.
+        /// Gets or sets the external scorer status.
         /// </summary>
-        private bool LanguageModelEnabled
+        private bool ExternalScorerEnabled
         {
-            get => _languageModelEnabled;
-            set => SetProperty(ref _languageModelEnabled, value,
-                onChanged: () => ((AsyncCommand)EnableLanguageModelCommand).RaiseCanExecuteChanged());
+            get => _externalScorerEnabled;
+            set => SetProperty(ref _externalScorerEnabled, value,
+                onChanged: () => ((AsyncCommand)EnableExternalScorerCommand).RaiseCanExecuteChanged());
         }
 
         private bool _isRunningInference;
@@ -205,8 +204,8 @@ namespace DeepSpeech.WPF.ViewModels
         {
             _sttClient = sttClient;
 
-            EnableLanguageModelCommand = new AsyncCommand(()=>EnableLanguageModelAsync(LMPath,TriePath),
-                _ => !LanguageModelEnabled);
+            EnableExternalScorerCommand = new AsyncCommand(()=>EnableExternalScorerAsync(ScorerPath),
+                _ => !ExternalScorerEnabled);
 
             InferenceFromFileCommand = new AsyncCommand(ExecuteInferenceFromFileAsync,
                 _ => !IsRunningInference);
@@ -322,21 +321,18 @@ namespace DeepSpeech.WPF.ViewModels
         }
 
         /// <summary>
-        /// Enables the language model.
+        /// Enables the external scorer.
         /// </summary>
-        /// <param name="lmPath">Language model path.</param>
-        /// <param name="triePath">Trie path.</param>
+        /// <param name="scorerPath">External scorer path.</param>
         /// <returns>A Task to await.</returns>
-        public async Task EnableLanguageModelAsync(string lmPath, string triePath)
+        public async Task EnableExternalScorerAsync(string scorerPath)
         {
             try
             {
-                StatusMessage = "Loading language model...";
-                const float LM_ALPHA = 0.75f;
-                const float LM_BETA = 1.85f;
-                await Task.Run(() => _sttClient.EnableDecoderWithLM(LMPath, TriePath, LM_ALPHA, LM_BETA));
-                LanguageModelEnabled = true;
-                StatusMessage = "Language model loaded.";
+                StatusMessage = "Loading external scorer...";
+                await Task.Run(() => _sttClient.EnableExternalScorer(ScorerPath));
+                ExternalScorerEnabled = true;
+                StatusMessage = "External scorer loaded.";
             }
             catch (Exception ex)
             {

@@ -11,8 +11,7 @@ Edit references to models path if necessary:
 
 ```
 let modelPath = './models/output_graph.pbmm';
-let lmPath = './models/lm.binary';
-let triePath = './models/trie';
+let scorerPath = './models/kenlm.scorer';
 ```
 
 Install Sox (for .wav file loading):

@@ -12,12 +12,9 @@ let model = new DeepSpeech.Model(modelPath, BEAM_WIDTH);
 
 let desiredSampleRate = model.sampleRate();
 
-const LM_ALPHA = 0.75;
-const LM_BETA = 1.85;
-let lmPath = './models/lm.binary';
-let triePath = './models/trie';
+let scorerPath = './models/kenlm.scorer';
 
-model.enableDecoderWithLM(lmPath, triePath, LM_ALPHA, LM_BETA);
+model.enableExternalScorer(scorerPath);
 
 let audioFile = process.argv[2] || './audio/2830-3980-0043.wav';

@@ -18,7 +18,7 @@ def main(args):
     parser.add_argument('--audio', required=False,
                         help='Path to the audio file to run (WAV format)')
     parser.add_argument('--model', required=True,
-                        help='Path to directory that contains all model files (output_graph, lm and trie)')
+                        help='Path to directory that contains all model files (output_graph and scorer)')
     parser.add_argument('--stream', required=False, action='store_true',
                         help='To use deepspeech streaming interface')
     args = parser.parse_args()
@@ -34,13 +34,13 @@ def main(args):
     dirName = os.path.expanduser(args.model)
 
     # Resolve all the paths of model files
-    output_graph, lm, trie = wavTranscriber.resolve_models(dirName)
+    output_graph, scorer = wavTranscriber.resolve_models(dirName)
 
-    # Load output_graph, alpahbet, lm and trie
-    model_retval = wavTranscriber.load_model(output_graph, lm, trie)
+    # Load output_graph, alphabet and scorer
+    model_retval = wavTranscriber.load_model(output_graph, scorer)
 
     if args.audio is not None:
-        title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'LM Load Time(s)']
+        title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'Scorer Load Time(s)']
         print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
 
         inference_time = 0.0
@@ -81,9 +81,9 @@ def main(args):
         try:
             while True:
                 data = subproc.stdout.read(512)
-                model_retval[0].feedAudioContent(sctx, np.frombuffer(data, np.int16))
+                sctx.feedAudioContent(np.frombuffer(data, np.int16))
         except KeyboardInterrupt:
-            print('Transcription: ', model_retval[0].finishStream(sctx))
+            print('Transcription: ', sctx.finishStream())
             subproc.terminate()
             subproc.wait()

@@ -109,7 +109,7 @@ class App(QMainWindow):
         self.microphone = QRadioButton("Microphone")
         self.fileUpload = QRadioButton("File Upload")
         self.browseBox = QLineEdit(self, placeholderText="Wave File, Mono @ 16 kHz, 16bit Little-Endian")
-        self.modelsBox = QLineEdit(self, placeholderText="Directory path for output_graph, lm & trie")
+        self.modelsBox = QLineEdit(self, placeholderText="Directory path for output_graph and scorer")
         self.textboxTranscript = QPlainTextEdit(self, placeholderText="Transcription")
         self.browseButton = QPushButton('Browse', self)
         self.browseButton.setToolTip('Select a wav file')
@@ -238,9 +238,9 @@ class App(QMainWindow):
 
     def modelResult(self, dirName):
        # Fetch and Resolve all the paths of model files
-        output_graph, lm, trie = wavTranscriber.resolve_models(dirName)
-        # Load output_graph, alpahbet, lm and trie
-        self.model = wavTranscriber.load_model(output_graph, lm, trie)
+        output_graph, scorer = wavTranscriber.resolve_models(dirName)
+        # Load output_graph, alphabet and scorer
+        self.model = wavTranscriber.load_model(output_graph, scorer)
 
     def modelFinish(self):
         # self.timer.stop()
@@ -316,9 +316,9 @@ class App(QMainWindow):
             logging.debug("Recording from your microphone")
             while (not self.openMicrophone.isChecked()):
                 data = context[1].stdout.read(512)
-                context[2].feedAudioContent(context[0], np.frombuffer(data, np.int16))
+                context[0].feedAudioContent(np.frombuffer(data, np.int16))
             else:
-                transcript = context[2].finishStream(context[0])
+                transcript = context[0].finishStream()
                 context[1].terminate()
                 context[1].wait()
                 self.show()
@@ -367,7 +367,7 @@ class App(QMainWindow):
 
         # Format pretty, extract filename from the full file path
         filename, ext = os.path.split(os.path.basename(waveFile))
-        title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'LM Load Time(s)']
+        title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'Scorer Load Time(s)']
         logging.debug("************************************************************************************************************")
         logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
         logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, self.model[1], self.model[2]))

@@ -8,7 +8,7 @@ pushd ${THIS}
 source ../tests.sh
 
 pip install --user $(get_python_wheel_url "$1")
-pip install --user -r requirements.txt
+pip install --user -r <(grep -v deepspeech requirements.txt)
 
 python audioTranscript_cmd.py \
   --audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \
@@ -8,28 +8,25 @@ from timeit import default_timer as timer
 '''
 Load the pre-trained model into the memory
 @param models: Output Graph Protocol Buffer file
-@param lm: Language model file
-@param trie: Trie file
+@param scorer: Scorer file
 
 @Retval
-Returns a list [DeepSpeech Object, Model Load Time, LM Load Time]
+Returns a list [DeepSpeech Object, Model Load Time, Scorer Load Time]
 '''
-def load_model(models, lm, trie):
+def load_model(models, scorer):
     BEAM_WIDTH = 500
-    LM_ALPHA = 0.75
-    LM_BETA = 1.85
 
     model_load_start = timer()
     ds = Model(models, BEAM_WIDTH)
     model_load_end = timer() - model_load_start
     logging.debug("Loaded model in %0.3fs." % (model_load_end))
 
-    lm_load_start = timer()
-    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
-    lm_load_end = timer() - lm_load_start
-    logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))
+    scorer_load_start = timer()
+    ds.enableExternalScorer(scorer)
+    scorer_load_end = timer() - scorer_load_start
+    logging.debug('Loaded external scorer in %0.3fs.' % (scorer_load_end))
 
-    return [ds, model_load_end, lm_load_end]
+    return [ds, model_load_end, scorer_load_end]
 
 '''
 Run Inference on input audio file
@@ -60,18 +57,16 @@ Resolve directory path for the models and fetch each of them.
 @param dirName: Path to the directory containing pre-trained models
 
 @Retval:
-Retunns a tuple containing each of the model files (pb, lm and trie)
+Returns a tuple containing each of the model files (pb, scorer)
 '''
 def resolve_models(dirName):
     pb = glob.glob(dirName + "/*.pb")[0]
     logging.debug("Found Model: %s" % pb)
 
-    lm = glob.glob(dirName + "/lm.binary")[0]
-    trie = glob.glob(dirName + "/trie")[0]
-    logging.debug("Found Language Model: %s" % lm)
-    logging.debug("Found Trie: %s" % trie)
+    scorer = glob.glob(dirName + "/kenlm.scorer")[0]
+    logging.debug("Found scorer: %s" % scorer)
 
-    return pb, lm, trie
+    return pb, scorer
 
 '''
 Generate VAD segments. Filters out non-voiced audio frames.
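Taken together, the two updated helpers compose as below — a sketch assuming a models directory holding one `.pb` graph plus a `kenlm.scorer`, as `resolve_models` now expects (the directory path is hypothetical):

```python
import wavTranscriber

# Resolve paths, then load; both return values follow the new signatures above.
output_graph, scorer = wavTranscriber.resolve_models('models/')
ds, model_load_time, scorer_load_time = wavTranscriber.load_model(output_graph, scorer)
print('Model loaded in %0.3fs, scorer in %0.3fs.' % (model_load_time, scorer_load_time))
```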
@@ -23,7 +23,7 @@ A list of distributions where the package is available can be found at: https://
 #### 0.1 Download Deepspeech
 Either clone from git via git clone, or Download a version from the release page
 
-For the next steps we assume you have extracted the files to ~/Deepspeech
+For the next steps we assume you have extracted the files to `~/Deepspeech`
 
 
 #### 0.2 Setup your environment
@@ -64,7 +64,7 @@ Set the aggressiveness mode, to an integer between 0 and 3.
     $ python3 audioTranscript_cmd.py --aggressive 1 --audio ./audio/guido-van-rossum.wav --model ./models/0.4.1/
 
 
-    Filename                  Duration(s)    Inference Time(s)    Model Load Time(s)    LM Load Time(s)
+    Filename                  Duration(s)    Inference Time(s)    Model Load Time(s)    Scorer Load Time(s)
     sample_rec.wav            13.710         20.797               5.593                 17.742
 
 ```