v0.6.0 examples
This commit is contained in:
Parent: 1cd56b53de
Commit: 80cafe6bfb
@ -0,0 +1,62 @@
# FFmpeg VAD Streaming

Streaming inference from an arbitrary source (FFmpeg input) to DeepSpeech, using VAD (voice activity detection). A fairly simple example demonstrating the DeepSpeech streaming API in Node.js.

This example was successfully tested with a mobile phone streaming a live feed to an RTMP server (nginx-rtmp), which this script could then consume for near-real-time speech recognition.
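For reference, a publisher (a phone app or another FFmpeg process) pushes audio to such an endpoint. The command below is only a sketch: it assumes an nginx-rtmp server reachable at `<IP>` with an application named `live` and a stream key of `teststream`.

```bash
# Hypothetical publisher: stream a local WAV file to the RTMP endpoint in real time
ffmpeg -re -i input.wav -c:a aac -f flv rtmp://<IP>:1935/live/teststream
```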
## Installation

```bash
npm install
```

FFmpeg must also be installed:

```bash
sudo apt-get install ffmpeg
```
## Usage

Here is an example for a local audio file:

```bash
node ./index.js --audio <AUDIO_FILE> \
  --model $HOME/models/output_graph.pbmm
```

Here is an example for a remote RTMP stream:

```bash
node ./index.js --audio rtmp://<IP>:1935/live/teststream \
  --model $HOME/models/output_graph.pbmm
```
## Examples

Real-time streaming inference with DeepSpeech's example audio ([audio-0.4.1.tar.gz](https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz)).

```bash
node ./index.js --audio $HOME/audio/2830-3980-0043.wav \
  --lm $HOME/models/lm.binary \
  --trie $HOME/models/trie \
  --model $HOME/models/output_graph.pbmm
```

```bash
node ./index.js --audio $HOME/audio/4507-16021-0012.wav \
  --lm $HOME/models/lm.binary \
  --trie $HOME/models/trie \
  --model $HOME/models/output_graph.pbmm
```

```bash
node ./index.js --audio $HOME/audio/8455-210777-0068.wav \
  --lm $HOME/models/lm.binary \
  --trie $HOME/models/trie \
  --model $HOME/models/output_graph.pbmm
```

Real-time streaming inference in combination with an RTMP server.

```bash
node ./index.js --audio rtmp://<HOST>/<APP>/<KEY> \
  --lm $HOME/models/lm.binary \
  --trie $HOME/models/trie \
  --model $HOME/models/output_graph.pbmm
```
## Notes

To get the best results for your own scenario, it may help to adjust the `VAD_MODE` and `DEBOUNCE_TIME` parameters in `index.js`.
@ -0,0 +1,123 @@
#!/usr/bin/env node

const VAD = require("node-vad");
const Ds = require('deepspeech');
const argparse = require('argparse');
const util = require('util');
const { spawn } = require('child_process');

// These constants control the beam search decoder

// Beam width used in the CTC decoder when building candidate transcriptions
const BEAM_WIDTH = 500;

// The alpha hyperparameter of the CTC decoder. Language Model weight
const LM_ALPHA = 0.75;

// The beta hyperparameter of the CTC decoder. Word insertion bonus.
const LM_BETA = 1.85;
let VersionAction = function VersionAction(options) {
  options = options || {};
  options.nargs = 0;
  argparse.Action.call(this, options);
};

util.inherits(VersionAction, argparse.Action);

VersionAction.prototype.call = function(parser) {
  Ds.printVersions();
  process.exit(0);
};

let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio source to run (ffmpeg supported formats)'});
parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exit'});
let args = parser.parseArgs();
function totalTime(hrtimeValue) {
  return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
}

console.error('Loading model from file %s', args['model']);
const model_load_start = process.hrtime();
let model = new Ds.Model(args['model'], BEAM_WIDTH);
const model_load_end = process.hrtime(model_load_start);
console.error('Loaded model in %ds.', totalTime(model_load_end));

if (args['lm'] && args['trie']) {
  console.error('Loading language model from files %s %s', args['lm'], args['trie']);
  const lm_load_start = process.hrtime();
  model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA);
  const lm_load_end = process.hrtime(lm_load_start);
  console.error('Loaded language model in %ds.', totalTime(lm_load_end));
}
// Default is 16kHz
const AUDIO_SAMPLE_RATE = 16000;

// Defines different thresholds for voice detection
// NORMAL: Suitable for high bitrate, low-noise data. May classify noise as voice, too.
// LOW_BITRATE: Detection mode optimised for low-bitrate audio.
// AGGRESSIVE: Detection mode best suited for somewhat noisy, lower quality audio.
// VERY_AGGRESSIVE: Detection mode with lowest miss-rate. Works well for most inputs.
const VAD_MODE = VAD.Mode.NORMAL;
// const VAD_MODE = VAD.Mode.LOW_BITRATE;
// const VAD_MODE = VAD.Mode.AGGRESSIVE;
// const VAD_MODE = VAD.Mode.VERY_AGGRESSIVE;

// Time in milliseconds for debouncing speech active state
const DEBOUNCE_TIME = 20;

// Create voice activity stream
const VAD_STREAM = VAD.createStream({
  mode: VAD_MODE,
  audioFrequency: AUDIO_SAMPLE_RATE,
  debounceTime: DEBOUNCE_TIME
});
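// The arguments below decode any FFmpeg-supported input to raw 16-bit
// little-endian mono PCM at AUDIO_SAMPLE_RATE on stdout, which is the format
// the VAD stream and DeepSpeech expect.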
// Spawn ffmpeg process
const ffmpeg = spawn('ffmpeg', [
  '-hide_banner',
  '-nostats',
  '-loglevel', 'fatal',
  '-i', args['audio'],
  '-vn',
  '-acodec', 'pcm_s16le',
  '-ac', 1,
  '-ar', AUDIO_SAMPLE_RATE,
  '-f', 's16le',
  'pipe:'
]);
let audioLength = 0;
let sctx = model.createStream();

function finishStream() {
  const inference_start = process.hrtime();
  console.error('Running inference.');
  console.log('Transcription: ', model.finishStream(sctx));
  const inference_end = process.hrtime(inference_start);
  console.error('Inference took %ds for %ds audio file.', totalTime(inference_end), audioLength.toPrecision(4));
  audioLength = 0;
}

function intermediateDecode() {
  finishStream();
  sctx = model.createStream();
}

function feedAudioContent(chunk) {
  audioLength += (chunk.length / 2) * (1 / AUDIO_SAMPLE_RATE);
  model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
}
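// VAD stream callback: while speech is active, keep feeding audio into the
// current DeepSpeech stream; once the VAD signals the end of speech, feed the
// final chunk and decode the accumulated stream.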
function processVad(data) {
  if (data.speech.start || data.speech.state) feedAudioContent(data.audioData);
  else if (data.speech.end) { feedAudioContent(data.audioData); intermediateDecode(); }
}

ffmpeg.stdout.pipe(VAD_STREAM).on('data', processVad);
@ -0,0 +1,16 @@
{
  "name": "ffmpeg-vad-streaming",
  "version": "1.0.0",
  "description": "Streaming inference from arbitrary source with VAD and FFmpeg",
  "main": "index.js",
  "scripts": {
    "start": "node ./index.js"
  },
  "dependencies": {
    "argparse": "^1.0.10",
    "deepspeech": "0.6.0",
    "node-vad": "^1.1.1",
    "util": "^0.11.1"
  },
  "license": "MIT"
}
@ -0,0 +1,27 @@
#!/bin/bash

set -xe

THIS=$(dirname "$0")

pushd ${THIS}
source ../tests.sh

npm install $(get_npm_package_url)
npm install

node ./index.js --audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \
  --lm $HOME/DeepSpeech/models/lm.binary \
  --trie $HOME/DeepSpeech/models/trie \
  --model $HOME/DeepSpeech/models/output_graph.pbmm

node ./index.js --audio $HOME/DeepSpeech/audio/4507-16021-0012.wav \
  --lm $HOME/DeepSpeech/models/lm.binary \
  --trie $HOME/DeepSpeech/models/trie \
  --model $HOME/DeepSpeech/models/output_graph.pbmm

node ./index.js --audio $HOME/DeepSpeech/audio/8455-210777-0068.wav \
  --lm $HOME/DeepSpeech/models/lm.binary \
  --trie $HOME/DeepSpeech/models/trie \
  --model $HOME/DeepSpeech/models/output_graph.pbmm
popd
@ -0,0 +1,69 @@
Microphone VAD Streaming
========================

Stream audio from a microphone to DeepSpeech, using VAD (voice activity detection). A fairly simple example demonstrating the DeepSpeech streaming API in Python. Also useful for quick, real-time testing of models and decoding parameters.

Installation
------------

.. code-block:: bash

   pip install -r requirements.txt

This example uses portaudio for microphone access, so on Linux you may need to install its header files to compile the ``pyaudio`` package:

.. code-block:: bash

   sudo apt install portaudio19-dev

Installation on macOS may fail because of portaudio; use brew to install it first:

.. code-block:: bash

   brew install portaudio
Usage
-----

.. code-block::

   usage: mic_vad_streaming.py [-h] [-v VAD_AGGRESSIVENESS] [--nospinner]
                               [-w SAVEWAV] -m MODEL [-l LM]
                               [-t TRIE] [-nf N_FEATURES] [-nc N_CONTEXT]
                               [-la LM_ALPHA] [-lb LM_BETA]
                               [-bw BEAM_WIDTH]

   Stream from microphone to DeepSpeech using VAD

   optional arguments:
     -h, --help            show this help message and exit
     -v VAD_AGGRESSIVENESS, --vad_aggressiveness VAD_AGGRESSIVENESS
                           Set aggressiveness of VAD: an integer between 0 and 3,
                           0 being the least aggressive about filtering out non-
                           speech, 3 the most aggressive. Default: 3
     --nospinner           Disable spinner
     -w SAVEWAV, --savewav SAVEWAV
                           Save .wav files of utterances to given directory
     -m MODEL, --model MODEL
                           Path to the model (protocol buffer binary file, or
                           entire directory containing all standard-named files
                           for model)
     -l LM, --lm LM        Path to the language model binary file. Default:
                           lm.binary
     -t TRIE, --trie TRIE  Path to the language model trie file created with
                           native_client/generate_trie. Default: trie
     -nf N_FEATURES, --n_features N_FEATURES
                           Number of MFCC features to use. Default: 26
     -nc N_CONTEXT, --n_context N_CONTEXT
                           Size of the context window used for producing
                           timesteps in the input vector. Default: 9
     -la LM_ALPHA, --lm_alpha LM_ALPHA
                           The alpha hyperparameter of the CTC decoder. Language
                           Model weight. Default: 0.75
     -lb LM_BETA, --lm_beta LM_BETA
                           The beta hyperparameter of the CTC decoder. Word
                           insertion bonus. Default: 1.85
     -bw BEAM_WIDTH, --beam_width BEAM_WIDTH
                           Beam width used in the CTC decoder when building
                           candidate transcriptions. Default: 500
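A minimal invocation might look like the following. This is only a sketch; the model, language model, and trie paths are assumptions and must point at your local copies:

.. code-block:: bash

   python mic_vad_streaming.py -m $HOME/models/output_graph.pbmm \
                               -l $HOME/models/lm.binary \
                               -t $HOME/models/trie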
@ -0,0 +1,237 @@
|
|||
import time, logging
|
||||
from datetime import datetime
|
||||
import threading, collections, queue, os, os.path
|
||||
import deepspeech
|
||||
import numpy as np
|
||||
import pyaudio
|
||||
import wave
|
||||
import webrtcvad
|
||||
from halo import Halo
|
||||
from scipy import signal
|
||||
|
||||
logging.basicConfig(level=20)
|
||||
|
||||
class Audio(object):
|
||||
"""Streams raw audio from microphone. Data is received in a separate thread, and stored in a buffer, to be read from."""
|
||||
|
||||
FORMAT = pyaudio.paInt16
|
||||
# Network/VAD rate-space
|
||||
RATE_PROCESS = 16000
|
||||
CHANNELS = 1
|
||||
BLOCKS_PER_SECOND = 50
|
||||
|
||||
def __init__(self, callback=None, device=None, input_rate=RATE_PROCESS, file=None):
|
||||
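        # PyAudio invokes this callback from its own thread; when a wav file is
        # given (--file), frames are read from the file instead of the capture buffer.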
def proxy_callback(in_data, frame_count, time_info, status):
|
||||
#pylint: disable=unused-argument
|
||||
if self.chunk is not None:
|
||||
in_data = self.wf.readframes(self.chunk)
|
||||
callback(in_data)
|
||||
return (None, pyaudio.paContinue)
|
||||
if callback is None: callback = lambda in_data: self.buffer_queue.put(in_data)
|
||||
self.buffer_queue = queue.Queue()
|
||||
self.device = device
|
||||
self.input_rate = input_rate
|
||||
self.sample_rate = self.RATE_PROCESS
|
||||
self.block_size = int(self.RATE_PROCESS / float(self.BLOCKS_PER_SECOND))
|
||||
self.block_size_input = int(self.input_rate / float(self.BLOCKS_PER_SECOND))
|
||||
self.pa = pyaudio.PyAudio()
|
||||
|
||||
kwargs = {
|
||||
'format': self.FORMAT,
|
||||
'channels': self.CHANNELS,
|
||||
'rate': self.input_rate,
|
||||
'input': True,
|
||||
'frames_per_buffer': self.block_size_input,
|
||||
'stream_callback': proxy_callback,
|
||||
}
|
||||
|
||||
self.chunk = None
|
||||
# if not default device
|
||||
if self.device:
|
||||
kwargs['input_device_index'] = self.device
|
||||
elif file is not None:
|
||||
self.chunk = 320
|
||||
self.wf = wave.open(file, 'rb')
|
||||
|
||||
self.stream = self.pa.open(**kwargs)
|
||||
self.stream.start_stream()
|
||||
|
||||
def resample(self, data, input_rate):
|
||||
"""
|
||||
Microphone may not support our native processing sampling rate, so
|
||||
resample from input_rate to RATE_PROCESS here for webrtcvad and
|
||||
deepspeech
|
||||
|
||||
Args:
|
||||
data (binary): Input audio stream
|
||||
input_rate (int): Input audio rate to resample from
|
||||
"""
|
||||
        data16 = np.frombuffer(data, dtype=np.int16)
|
||||
resample_size = int(len(data16) / self.input_rate * self.RATE_PROCESS)
|
||||
resample = signal.resample(data16, resample_size)
|
||||
resample16 = np.array(resample, dtype=np.int16)
|
||||
        return resample16.tobytes()
|
||||
|
||||
def read_resampled(self):
|
||||
"""Return a block of audio data resampled to 16000hz, blocking if necessary."""
|
||||
return self.resample(data=self.buffer_queue.get(),
|
||||
input_rate=self.input_rate)
|
||||
|
||||
def read(self):
|
||||
"""Return a block of audio data, blocking if necessary."""
|
||||
return self.buffer_queue.get()
|
||||
|
||||
def destroy(self):
|
||||
self.stream.stop_stream()
|
||||
self.stream.close()
|
||||
self.pa.terminate()
|
||||
|
||||
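    # Duration of one buffered block in milliseconds (20 ms at the default settings).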
frame_duration_ms = property(lambda self: 1000 * self.block_size // self.sample_rate)
|
||||
|
||||
def write_wav(self, filename, data):
|
||||
logging.info("write wav %s", filename)
|
||||
wf = wave.open(filename, 'wb')
|
||||
wf.setnchannels(self.CHANNELS)
|
||||
# wf.setsampwidth(self.pa.get_sample_size(FORMAT))
|
||||
assert self.FORMAT == pyaudio.paInt16
|
||||
wf.setsampwidth(2)
|
||||
wf.setframerate(self.sample_rate)
|
||||
wf.writeframes(data)
|
||||
wf.close()
|
||||
|
||||
|
||||
class VADAudio(Audio):
|
||||
"""Filter & segment audio with voice activity detection."""
|
||||
|
||||
def __init__(self, aggressiveness=3, device=None, input_rate=None, file=None):
|
||||
super().__init__(device=device, input_rate=input_rate, file=file)
|
||||
self.vad = webrtcvad.Vad(aggressiveness)
|
||||
|
||||
def frame_generator(self):
|
||||
"""Generator that yields all audio frames from microphone."""
|
||||
if self.input_rate == self.RATE_PROCESS:
|
||||
while True:
|
||||
yield self.read()
|
||||
else:
|
||||
while True:
|
||||
yield self.read_resampled()
|
||||
|
||||
    def vad_collector(self, padding_ms=300, ratio=0.75, frames=None):
        """Generator that yields series of consecutive audio frames comprising each utterance, separated by yielding a single None.
        Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered.
        Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
                  |---utterance---|        |---utterance---|
        """
|
||||
if frames is None: frames = self.frame_generator()
|
||||
num_padding_frames = padding_ms // self.frame_duration_ms
|
||||
ring_buffer = collections.deque(maxlen=num_padding_frames)
|
||||
triggered = False
|
||||
|
||||
for frame in frames:
|
||||
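            # A full frame is 640 bytes (320 samples = 20 ms at 16 kHz); anything
            # shorter signals the end of the input, so stop the generator.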
if len(frame) < 640:
|
||||
return
|
||||
|
||||
is_speech = self.vad.is_speech(frame, self.sample_rate)
|
||||
|
||||
if not triggered:
|
||||
ring_buffer.append((frame, is_speech))
|
||||
num_voiced = len([f for f, speech in ring_buffer if speech])
|
||||
if num_voiced > ratio * ring_buffer.maxlen:
|
||||
triggered = True
|
||||
for f, s in ring_buffer:
|
||||
yield f
|
||||
ring_buffer.clear()
|
||||
|
||||
else:
|
||||
yield frame
|
||||
ring_buffer.append((frame, is_speech))
|
||||
num_unvoiced = len([f for f, speech in ring_buffer if not speech])
|
||||
if num_unvoiced > ratio * ring_buffer.maxlen:
|
||||
triggered = False
|
||||
yield None
|
||||
ring_buffer.clear()
|
||||
|
||||
def main(ARGS):
|
||||
# Load DeepSpeech model
|
||||
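    # A directory can be passed instead of a file; resolve the standard model,
    # language model, and trie file names inside it.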
if os.path.isdir(ARGS.model):
|
||||
model_dir = ARGS.model
|
||||
ARGS.model = os.path.join(model_dir, 'output_graph.pb')
|
||||
ARGS.lm = os.path.join(model_dir, ARGS.lm)
|
||||
ARGS.trie = os.path.join(model_dir, ARGS.trie)
|
||||
|
||||
print('Initializing model...')
|
||||
logging.info("ARGS.model: %s", ARGS.model)
|
||||
model = deepspeech.Model(ARGS.model, ARGS.beam_width)
|
||||
if ARGS.lm and ARGS.trie:
|
||||
logging.info("ARGS.lm: %s", ARGS.lm)
|
||||
logging.info("ARGS.trie: %s", ARGS.trie)
|
||||
model.enableDecoderWithLM(ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta)
|
||||
|
||||
# Start audio with VAD
|
||||
vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
|
||||
device=ARGS.device,
|
||||
input_rate=ARGS.rate,
|
||||
file=ARGS.file)
|
||||
print("Listening (ctrl-C to exit)...")
|
||||
frames = vad_audio.vad_collector()
|
||||
|
||||
# Stream from microphone to DeepSpeech using VAD
|
||||
spinner = None
|
||||
if not ARGS.nospinner:
|
||||
spinner = Halo(spinner='line')
|
||||
stream_context = model.createStream()
|
||||
wav_data = bytearray()
|
||||
for frame in frames:
|
||||
if frame is not None:
|
||||
if spinner: spinner.start()
|
||||
logging.debug("streaming frame")
|
||||
model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
|
||||
if ARGS.savewav: wav_data.extend(frame)
|
||||
else:
|
||||
if spinner: spinner.stop()
|
||||
            logging.debug("end utterance")
|
||||
if ARGS.savewav:
|
||||
vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
|
||||
wav_data = bytearray()
|
||||
text = model.finishStream(stream_context)
|
||||
print("Recognized: %s" % text)
|
||||
stream_context = model.createStream()
|
||||
|
||||
if __name__ == '__main__':
|
||||
BEAM_WIDTH = 500
|
||||
DEFAULT_SAMPLE_RATE = 16000
|
||||
LM_ALPHA = 0.75
|
||||
LM_BETA = 1.85
|
||||
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
|
||||
|
||||
parser.add_argument('-v', '--vad_aggressiveness', type=int, default=3,
|
||||
help="Set aggressiveness of VAD: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
|
||||
parser.add_argument('--nospinner', action='store_true',
|
||||
help="Disable spinner")
|
||||
parser.add_argument('-w', '--savewav',
|
||||
                        help="Save .wav files of utterances to given directory")
|
||||
parser.add_argument('-f', '--file',
|
||||
help="Read from .wav file instead of microphone")
|
||||
|
||||
parser.add_argument('-m', '--model', required=True,
|
||||
help="Path to the model (protocol buffer binary file, or entire directory containing all standard-named files for model)")
|
||||
parser.add_argument('-l', '--lm', default='lm.binary',
|
||||
help="Path to the language model binary file. Default: lm.binary")
|
||||
parser.add_argument('-t', '--trie', default='trie',
|
||||
help="Path to the language model trie file created with native_client/generate_trie. Default: trie")
|
||||
parser.add_argument('-d', '--device', type=int, default=None,
|
||||
                        help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to the PyAudio default input device.")
|
||||
parser.add_argument('-r', '--rate', type=int, default=DEFAULT_SAMPLE_RATE,
|
||||
help=f"Input device sample rate. Default: {DEFAULT_SAMPLE_RATE}. Your device may require 44100.")
|
||||
parser.add_argument('-la', '--lm_alpha', type=float, default=LM_ALPHA,
|
||||
help=f"The alpha hyperparameter of the CTC decoder. Language Model weight. Default: {LM_ALPHA}")
|
||||
parser.add_argument('-lb', '--lm_beta', type=float, default=LM_BETA,
|
||||
help=f"The beta hyperparameter of the CTC decoder. Word insertion bonus. Default: {LM_BETA}")
|
||||
parser.add_argument('-bw', '--beam_width', type=int, default=BEAM_WIDTH,
|
||||
help=f"Beam width used in the CTC decoder when building candidate transcriptions. Default: {BEAM_WIDTH}")
|
||||
|
||||
ARGS = parser.parse_args()
|
||||
if ARGS.savewav: os.makedirs(ARGS.savewav, exist_ok=True)
|
||||
main(ARGS)
|
|
@ -0,0 +1,6 @@
deepspeech==0.6.0
pyaudio~=0.2.11
webrtcvad~=2.0.10
halo~=0.0.18
numpy>=1.15.1
scipy>=1.1.0
@ -0,0 +1,20 @@
#!/bin/bash

set -xe

THIS=$(dirname "$0")

pushd ${THIS}
source ../tests.sh

pip install --user $(get_python_wheel_url "$1")
pip install --user -r requirements.txt

pulseaudio &

python mic_vad_streaming.py \
  --model $HOME/DeepSpeech/models/output_graph.pbmm \
  --lm $HOME/DeepSpeech/models/lm.binary \
  --trie $HOME/DeepSpeech/models/trie \
  --file $HOME/DeepSpeech/audio/2830-3980-0043.wav
popd
@ -0,0 +1,330 @@
|
|||
## Ignore Visual Studio temporary files, build results, and
|
||||
## files generated by popular Visual Studio add-ons.
|
||||
##
|
||||
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
|
||||
|
||||
# User-specific files
|
||||
*.suo
|
||||
*.user
|
||||
*.userosscache
|
||||
*.sln.docstates
|
||||
|
||||
# User-specific files (MonoDevelop/Xamarin Studio)
|
||||
*.userprefs
|
||||
|
||||
# Build results
|
||||
[Dd]ebug/
|
||||
[Dd]ebugPublic/
|
||||
[Rr]elease/
|
||||
[Rr]eleases/
|
||||
x64/
|
||||
x86/
|
||||
bld/
|
||||
[Bb]in/
|
||||
[Oo]bj/
|
||||
[Ll]og/
|
||||
|
||||
# Visual Studio 2015/2017 cache/options directory
|
||||
.vs/
|
||||
# Uncomment if you have tasks that create the project's static files in wwwroot
|
||||
#wwwroot/
|
||||
|
||||
# Visual Studio 2017 auto generated files
|
||||
Generated\ Files/
|
||||
|
||||
# MSTest test Results
|
||||
[Tt]est[Rr]esult*/
|
||||
[Bb]uild[Ll]og.*
|
||||
|
||||
# NUNIT
|
||||
*.VisualState.xml
|
||||
TestResult.xml
|
||||
|
||||
# Build Results of an ATL Project
|
||||
[Dd]ebugPS/
|
||||
[Rr]eleasePS/
|
||||
dlldata.c
|
||||
|
||||
# Benchmark Results
|
||||
BenchmarkDotNet.Artifacts/
|
||||
|
||||
# .NET Core
|
||||
project.lock.json
|
||||
project.fragment.lock.json
|
||||
artifacts/
|
||||
**/Properties/launchSettings.json
|
||||
|
||||
# StyleCop
|
||||
StyleCopReport.xml
|
||||
|
||||
# Files built by Visual Studio
|
||||
*_i.c
|
||||
*_p.c
|
||||
*_i.h
|
||||
*.ilk
|
||||
*.meta
|
||||
*.obj
|
||||
*.iobj
|
||||
*.pch
|
||||
*.pdb
|
||||
*.ipdb
|
||||
*.pgc
|
||||
*.pgd
|
||||
*.rsp
|
||||
*.sbr
|
||||
*.tlb
|
||||
*.tli
|
||||
*.tlh
|
||||
*.tmp
|
||||
*.tmp_proj
|
||||
*.log
|
||||
*.vspscc
|
||||
*.vssscc
|
||||
.builds
|
||||
*.pidb
|
||||
*.svclog
|
||||
*.scc
|
||||
|
||||
# Chutzpah Test files
|
||||
_Chutzpah*
|
||||
|
||||
# Visual C++ cache files
|
||||
ipch/
|
||||
*.aps
|
||||
*.ncb
|
||||
*.opendb
|
||||
*.opensdf
|
||||
*.sdf
|
||||
*.cachefile
|
||||
*.VC.db
|
||||
*.VC.VC.opendb
|
||||
|
||||
# Visual Studio profiler
|
||||
*.psess
|
||||
*.vsp
|
||||
*.vspx
|
||||
*.sap
|
||||
|
||||
# Visual Studio Trace Files
|
||||
*.e2e
|
||||
|
||||
# TFS 2012 Local Workspace
|
||||
$tf/
|
||||
|
||||
# Guidance Automation Toolkit
|
||||
*.gpState
|
||||
|
||||
# ReSharper is a .NET coding add-in
|
||||
_ReSharper*/
|
||||
*.[Rr]e[Ss]harper
|
||||
*.DotSettings.user
|
||||
|
||||
# JustCode is a .NET coding add-in
|
||||
.JustCode
|
||||
|
||||
# TeamCity is a build add-in
|
||||
_TeamCity*
|
||||
|
||||
# DotCover is a Code Coverage Tool
|
||||
*.dotCover
|
||||
|
||||
# AxoCover is a Code Coverage Tool
|
||||
.axoCover/*
|
||||
!.axoCover/settings.json
|
||||
|
||||
# Visual Studio code coverage results
|
||||
*.coverage
|
||||
*.coveragexml
|
||||
|
||||
# NCrunch
|
||||
_NCrunch_*
|
||||
.*crunch*.local.xml
|
||||
nCrunchTemp_*
|
||||
|
||||
# MightyMoose
|
||||
*.mm.*
|
||||
AutoTest.Net/
|
||||
|
||||
# Web workbench (sass)
|
||||
.sass-cache/
|
||||
|
||||
# Installshield output folder
|
||||
[Ee]xpress/
|
||||
|
||||
# DocProject is a documentation generator add-in
|
||||
DocProject/buildhelp/
|
||||
DocProject/Help/*.HxT
|
||||
DocProject/Help/*.HxC
|
||||
DocProject/Help/*.hhc
|
||||
DocProject/Help/*.hhk
|
||||
DocProject/Help/*.hhp
|
||||
DocProject/Help/Html2
|
||||
DocProject/Help/html
|
||||
|
||||
# Click-Once directory
|
||||
publish/
|
||||
|
||||
# Publish Web Output
|
||||
*.[Pp]ublish.xml
|
||||
*.azurePubxml
|
||||
# Note: Comment the next line if you want to checkin your web deploy settings,
|
||||
# but database connection strings (with potential passwords) will be unencrypted
|
||||
*.pubxml
|
||||
*.publishproj
|
||||
|
||||
# Microsoft Azure Web App publish settings. Comment the next line if you want to
|
||||
# checkin your Azure Web App publish settings, but sensitive information contained
|
||||
# in these scripts will be unencrypted
|
||||
PublishScripts/
|
||||
|
||||
# NuGet Packages
|
||||
*.nupkg
|
||||
# The packages folder can be ignored because of Package Restore
|
||||
**/[Pp]ackages/*
|
||||
# except build/, which is used as an MSBuild target.
|
||||
!**/[Pp]ackages/build/
|
||||
# Uncomment if necessary however generally it will be regenerated when needed
|
||||
#!**/[Pp]ackages/repositories.config
|
||||
# NuGet v3's project.json files produces more ignorable files
|
||||
*.nuget.props
|
||||
*.nuget.targets
|
||||
|
||||
# Microsoft Azure Build Output
|
||||
csx/
|
||||
*.build.csdef
|
||||
|
||||
# Microsoft Azure Emulator
|
||||
ecf/
|
||||
rcf/
|
||||
|
||||
# Windows Store app package directories and files
|
||||
AppPackages/
|
||||
BundleArtifacts/
|
||||
Package.StoreAssociation.xml
|
||||
_pkginfo.txt
|
||||
*.appx
|
||||
|
||||
# Visual Studio cache files
|
||||
# files ending in .cache can be ignored
|
||||
*.[Cc]ache
|
||||
# but keep track of directories ending in .cache
|
||||
!*.[Cc]ache/
|
||||
|
||||
# Others
|
||||
ClientBin/
|
||||
~$*
|
||||
*~
|
||||
*.dbmdl
|
||||
*.dbproj.schemaview
|
||||
*.jfm
|
||||
*.pfx
|
||||
*.publishsettings
|
||||
orleans.codegen.cs
|
||||
|
||||
# Including strong name files can present a security risk
|
||||
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
|
||||
#*.snk
|
||||
|
||||
# Since there are multiple workflows, uncomment next line to ignore bower_components
|
||||
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
|
||||
#bower_components/
|
||||
|
||||
# RIA/Silverlight projects
|
||||
Generated_Code/
|
||||
|
||||
# Backup & report files from converting an old project file
|
||||
# to a newer Visual Studio version. Backup files are not needed,
|
||||
# because we have git ;-)
|
||||
_UpgradeReport_Files/
|
||||
Backup*/
|
||||
UpgradeLog*.XML
|
||||
UpgradeLog*.htm
|
||||
ServiceFabricBackup/
|
||||
*.rptproj.bak
|
||||
|
||||
# SQL Server files
|
||||
*.mdf
|
||||
*.ldf
|
||||
*.ndf
|
||||
|
||||
# Business Intelligence projects
|
||||
*.rdl.data
|
||||
*.bim.layout
|
||||
*.bim_*.settings
|
||||
*.rptproj.rsuser
|
||||
|
||||
# Microsoft Fakes
|
||||
FakesAssemblies/
|
||||
|
||||
# GhostDoc plugin setting file
|
||||
*.GhostDoc.xml
|
||||
|
||||
# Node.js Tools for Visual Studio
|
||||
.ntvs_analysis.dat
|
||||
node_modules/
|
||||
|
||||
# Visual Studio 6 build log
|
||||
*.plg
|
||||
|
||||
# Visual Studio 6 workspace options file
|
||||
*.opt
|
||||
|
||||
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
|
||||
*.vbw
|
||||
|
||||
# Visual Studio LightSwitch build output
|
||||
**/*.HTMLClient/GeneratedArtifacts
|
||||
**/*.DesktopClient/GeneratedArtifacts
|
||||
**/*.DesktopClient/ModelManifest.xml
|
||||
**/*.Server/GeneratedArtifacts
|
||||
**/*.Server/ModelManifest.xml
|
||||
_Pvt_Extensions
|
||||
|
||||
# Paket dependency manager
|
||||
.paket/paket.exe
|
||||
paket-files/
|
||||
|
||||
# FAKE - F# Make
|
||||
.fake/
|
||||
|
||||
# JetBrains Rider
|
||||
.idea/
|
||||
*.sln.iml
|
||||
|
||||
# CodeRush
|
||||
.cr/
|
||||
|
||||
# Python Tools for Visual Studio (PTVS)
|
||||
__pycache__/
|
||||
*.pyc
|
||||
|
||||
# Cake - Uncomment if you are using it
|
||||
# tools/**
|
||||
# !tools/packages.config
|
||||
|
||||
# Tabs Studio
|
||||
*.tss
|
||||
|
||||
# Telerik's JustMock configuration file
|
||||
*.jmconfig
|
||||
|
||||
# BizTalk build output
|
||||
*.btp.cs
|
||||
*.btm.cs
|
||||
*.odx.cs
|
||||
*.xsd.cs
|
||||
|
||||
# OpenCover UI analysis results
|
||||
OpenCover/
|
||||
|
||||
# Azure Stream Analytics local run output
|
||||
ASALocalRun/
|
||||
|
||||
# MSBuild Binary and Structured Log
|
||||
*.binlog
|
||||
|
||||
# NVidia Nsight GPU debugger configuration file
|
||||
*.nvuser
|
||||
|
||||
# MFractors (Xamarin productivity tool) working folder
|
||||
.mfractor/
|
|
@ -0,0 +1,6 @@
|
|||
<?xml version="1.0" encoding="utf-8" ?>
|
||||
<configuration>
|
||||
<startup>
|
||||
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.6.2" />
|
||||
</startup>
|
||||
</configuration>
|
|
@ -0,0 +1,8 @@
|
|||
<Application
|
||||
x:Class="DeepSpeechWPF.App"
|
||||
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
|
||||
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
|
||||
xmlns:local="clr-namespace:DeepSpeechWPF"
|
||||
StartupUri="MainWindow.xaml">
|
||||
<Application.Resources />
|
||||
</Application>
|
|
@ -0,0 +1,44 @@
|
|||
using CommonServiceLocator;
|
||||
using DeepSpeech.WPF.ViewModels;
|
||||
using DeepSpeechClient.Interfaces;
|
||||
using GalaSoft.MvvmLight.Ioc;
|
||||
using System.Windows;
|
||||
|
||||
namespace DeepSpeechWPF
|
||||
{
|
||||
/// <summary>
|
||||
/// Interaction logic for App.xaml
|
||||
/// </summary>
|
||||
public partial class App : Application
|
||||
{
|
||||
protected override void OnStartup(StartupEventArgs e)
|
||||
{
|
||||
base.OnStartup(e);
|
||||
ServiceLocator.SetLocatorProvider(() => SimpleIoc.Default);
|
||||
|
||||
const int BEAM_WIDTH = 500;
|
||||
|
||||
//Register instance of DeepSpeech
|
||||
DeepSpeechClient.DeepSpeech deepSpeechClient = new DeepSpeechClient.DeepSpeech();
|
||||
try
|
||||
{
|
||||
deepSpeechClient.CreateModel("output_graph.pbmm", BEAM_WIDTH);
|
||||
}
|
||||
catch (System.Exception ex)
|
||||
{
|
||||
MessageBox.Show(ex.Message);
|
||||
Current.Shutdown();
|
||||
}
|
||||
|
||||
SimpleIoc.Default.Register<IDeepSpeech>(() => deepSpeechClient);
|
||||
SimpleIoc.Default.Register<MainWindowViewModel>();
|
||||
}
|
||||
|
||||
protected override void OnExit(ExitEventArgs e)
|
||||
{
|
||||
base.OnExit(e);
|
||||
//Dispose instance of DeepSpeech
|
||||
ServiceLocator.Current.GetInstance<IDeepSpeech>()?.Dispose();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,140 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
|
||||
<PropertyGroup>
|
||||
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
|
||||
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
|
||||
<ProjectGuid>{54BFD766-4305-4F4C-BA59-AF45505DF3C1}</ProjectGuid>
|
||||
<OutputType>WinExe</OutputType>
|
||||
<RootNamespace>DeepSpeech.WPF</RootNamespace>
|
||||
<AssemblyName>DeepSpeech.WPF</AssemblyName>
|
||||
<TargetFrameworkVersion>v4.6.2</TargetFrameworkVersion>
|
||||
<FileAlignment>512</FileAlignment>
|
||||
<ProjectTypeGuids>{60dc8134-eba5-43b8-bcc9-bb4bc16c2548};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
|
||||
<WarningLevel>4</WarningLevel>
|
||||
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
|
||||
<Deterministic>true</Deterministic>
|
||||
<NuGetPackageImportStamp>
|
||||
</NuGetPackageImportStamp>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
|
||||
<DebugSymbols>true</DebugSymbols>
|
||||
<OutputPath>bin\x64\Debug\</OutputPath>
|
||||
<DefineConstants>DEBUG;TRACE</DefineConstants>
|
||||
<DebugType>full</DebugType>
|
||||
<PlatformTarget>AnyCPU</PlatformTarget>
|
||||
<ErrorReport>prompt</ErrorReport>
|
||||
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
|
||||
<Prefer32Bit>false</Prefer32Bit>
|
||||
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x64'">
|
||||
<OutputPath>bin\x64\Release\</OutputPath>
|
||||
<DefineConstants>TRACE</DefineConstants>
|
||||
<Optimize>true</Optimize>
|
||||
<DebugType>pdbonly</DebugType>
|
||||
<PlatformTarget>x64</PlatformTarget>
|
||||
<ErrorReport>prompt</ErrorReport>
|
||||
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
|
||||
<Prefer32Bit>true</Prefer32Bit>
|
||||
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
|
||||
</PropertyGroup>
|
||||
<ItemGroup>
|
||||
<Reference Include="AsyncAwaitBestPractices, Version=1.0.0.0, Culture=neutral, processorArchitecture=MSIL">
|
||||
<HintPath>packages\AsyncAwaitBestPractices.3.1.0\lib\netstandard1.0\AsyncAwaitBestPractices.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="AsyncAwaitBestPractices.MVVM, Version=1.0.0.0, Culture=neutral, processorArchitecture=MSIL">
|
||||
<HintPath>packages\AsyncAwaitBestPractices.MVVM.3.1.0\lib\netstandard1.0\AsyncAwaitBestPractices.MVVM.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="CommonServiceLocator, Version=2.0.2.0, Culture=neutral, PublicKeyToken=489b6accfaf20ef0, processorArchitecture=MSIL">
|
||||
<HintPath>packages\CommonServiceLocator.2.0.2\lib\net45\CommonServiceLocator.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="CSCore, Version=1.2.1.2, Culture=neutral, PublicKeyToken=5a08f2b6f4415dea, processorArchitecture=MSIL">
|
||||
<HintPath>packages\CSCore.1.2.1.2\lib\net35-client\CSCore.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="GalaSoft.MvvmLight, Version=5.4.1.0, Culture=neutral, PublicKeyToken=e7570ab207bcb616, processorArchitecture=MSIL">
|
||||
<HintPath>packages\MvvmLightLibs.5.4.1.1\lib\net45\GalaSoft.MvvmLight.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="GalaSoft.MvvmLight.Extras, Version=5.4.1.0, Culture=neutral, PublicKeyToken=669f0b5e8f868abf, processorArchitecture=MSIL">
|
||||
<HintPath>packages\MvvmLightLibs.5.4.1.1\lib\net45\GalaSoft.MvvmLight.Extras.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="GalaSoft.MvvmLight.Platform, Version=5.4.1.0, Culture=neutral, PublicKeyToken=5f873c45e98af8a1, processorArchitecture=MSIL">
|
||||
<HintPath>packages\MvvmLightLibs.5.4.1.1\lib\net45\GalaSoft.MvvmLight.Platform.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="NAudio, Version=1.9.0.0, Culture=neutral, processorArchitecture=MSIL">
|
||||
<HintPath>packages\NAudio.1.9.0\lib\net35\NAudio.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="System" />
|
||||
<Reference Include="System.Data" />
|
||||
<Reference Include="System.Windows.Forms" />
|
||||
<Reference Include="System.Windows.Interactivity, Version=4.5.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
|
||||
<HintPath>packages\MvvmLightLibs.5.4.1.1\lib\net45\System.Windows.Interactivity.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="System.Xml" />
|
||||
<Reference Include="Microsoft.CSharp" />
|
||||
<Reference Include="System.Core" />
|
||||
<Reference Include="System.Xml.Linq" />
|
||||
<Reference Include="System.Data.DataSetExtensions" />
|
||||
<Reference Include="System.Net.Http" />
|
||||
<Reference Include="System.Xaml">
|
||||
<RequiredTargetFramework>4.0</RequiredTargetFramework>
|
||||
</Reference>
|
||||
<Reference Include="WindowsBase" />
|
||||
<Reference Include="PresentationCore" />
|
||||
<Reference Include="PresentationFramework" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ApplicationDefinition Include="App.xaml">
|
||||
<Generator>MSBuild:Compile</Generator>
|
||||
<SubType>Designer</SubType>
|
||||
</ApplicationDefinition>
|
||||
<Compile Include="ViewModels\MainWindowViewModel.cs" />
|
||||
<Page Include="MainWindow.xaml">
|
||||
<Generator>MSBuild:Compile</Generator>
|
||||
<SubType>Designer</SubType>
|
||||
</Page>
|
||||
<Compile Include="App.xaml.cs">
|
||||
<DependentUpon>App.xaml</DependentUpon>
|
||||
<SubType>Code</SubType>
|
||||
</Compile>
|
||||
<Compile Include="ViewModels\BindableBase.cs" />
|
||||
<Compile Include="MainWindow.xaml.cs">
|
||||
<DependentUpon>MainWindow.xaml</DependentUpon>
|
||||
<SubType>Code</SubType>
|
||||
</Compile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Compile Include="Properties\AssemblyInfo.cs">
|
||||
<SubType>Code</SubType>
|
||||
</Compile>
|
||||
<Compile Include="Properties\Resources.Designer.cs">
|
||||
<AutoGen>True</AutoGen>
|
||||
<DesignTime>True</DesignTime>
|
||||
<DependentUpon>Resources.resx</DependentUpon>
|
||||
</Compile>
|
||||
<Compile Include="Properties\Settings.Designer.cs">
|
||||
<AutoGen>True</AutoGen>
|
||||
<DependentUpon>Settings.settings</DependentUpon>
|
||||
<DesignTimeSharedInput>True</DesignTimeSharedInput>
|
||||
</Compile>
|
||||
<EmbeddedResource Include="Properties\Resources.resx">
|
||||
<Generator>ResXFileCodeGenerator</Generator>
|
||||
<LastGenOutput>Resources.Designer.cs</LastGenOutput>
|
||||
</EmbeddedResource>
|
||||
<None Include="packages.config" />
|
||||
<None Include="Properties\Settings.settings">
|
||||
<Generator>SettingsSingleFileGenerator</Generator>
|
||||
<LastGenOutput>Settings.Designer.cs</LastGenOutput>
|
||||
</None>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="App.config" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\..\native_client\dotnet\DeepSpeechClient\DeepSpeechClient.csproj">
|
||||
<Project>{56de4091-bbbe-47e4-852d-7268b33b971f}</Project>
|
||||
<Name>DeepSpeechClient</Name>
|
||||
</ProjectReference>
|
||||
</ItemGroup>
|
||||
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
|
||||
</Project>
|
|
@ -0,0 +1,31 @@
|
|||
|
||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||
# Visual Studio 15
|
||||
VisualStudioVersion = 15.0.28307.421
|
||||
MinimumVisualStudioVersion = 10.0.40219.1
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeech.WPF", "DeepSpeech.WPF.csproj", "{54BFD766-4305-4F4C-BA59-AF45505DF3C1}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechClient", "..\..\..\native_client\dotnet\DeepSpeechClient\DeepSpeechClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|x64 = Debug|x64
|
||||
Release|x64 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{54BFD766-4305-4F4C-BA59-AF45505DF3C1}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{54BFD766-4305-4F4C-BA59-AF45505DF3C1}.Debug|x64.Build.0 = Debug|x64
|
||||
{54BFD766-4305-4F4C-BA59-AF45505DF3C1}.Release|x64.ActiveCfg = Release|x64
|
||||
{54BFD766-4305-4F4C-BA59-AF45505DF3C1}.Release|x64.Build.0 = Release|x64
|
||||
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.Build.0 = Debug|x64
|
||||
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.ActiveCfg = Release|x64
|
||||
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
EndGlobalSection
|
||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||
SolutionGuid = {19C58802-CCEC-4FD1-8D17-A6EB766116F7}
|
||||
EndGlobalSection
|
||||
EndGlobal
|
|
@ -0,0 +1,102 @@
|
|||
<Window
|
||||
x:Class="DeepSpeechWPF.MainWindow"
|
||||
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
|
||||
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
|
||||
xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
|
||||
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
|
||||
    Title="DeepSpeech client"
|
||||
Width="800"
|
||||
Height="600"
|
||||
Loaded="Window_Loaded"
|
||||
WindowStartupLocation="CenterScreen"
|
||||
mc:Ignorable="d">
|
||||
<Grid>
|
||||
<Grid.RowDefinitions>
|
||||
<RowDefinition Height="222" />
|
||||
<RowDefinition />
|
||||
</Grid.RowDefinitions>
|
||||
<TextBox
|
||||
Grid.Row="1"
|
||||
Margin="10,36,10,10"
|
||||
FontSize="16px"
|
||||
Text="{Binding Transcription, Mode=OneWay}"
|
||||
TextWrapping="Wrap" />
|
||||
<Label
|
||||
Grid.Row="1"
|
||||
Height="26"
|
||||
Margin="10,5,10,0"
|
||||
VerticalAlignment="Top"
|
||||
Content="Results:" />
|
||||
<Label
|
||||
Height="26"
|
||||
Margin="10,10,10,0"
|
||||
VerticalAlignment="Top"
|
||||
            Content="Select an audio file to transcribe:" />
|
||||
<TextBox
|
||||
Height="23"
|
||||
Margin="10,41,10,0"
|
||||
VerticalAlignment="Top"
|
||||
Text="{Binding AudioFilePath, Mode=TwoWay}"
|
||||
TextWrapping="Wrap" />
|
||||
<Button
|
||||
Width="80"
|
||||
Height="25"
|
||||
Margin="10,69,0,0"
|
||||
HorizontalAlignment="Left"
|
||||
VerticalAlignment="Top"
|
||||
Command="{Binding SelectFileCommand}"
|
||||
Content="Open file" />
|
||||
<Button
|
||||
Width="82"
|
||||
Height="25"
|
||||
Margin="95,69,0,0"
|
||||
HorizontalAlignment="Left"
|
||||
VerticalAlignment="Top"
|
||||
Command="{Binding EnableLanguageModelCommand}"
|
||||
Content="Enable LM" />
|
||||
<Button
|
||||
Width="75"
|
||||
Height="25"
|
||||
Margin="182,69,0,0"
|
||||
HorizontalAlignment="Left"
|
||||
VerticalAlignment="Top"
|
||||
Command="{Binding InferenceFromFileCommand}"
|
||||
            Content="Transcribe" />
|
||||
<Label
|
||||
Height="30"
|
||||
Margin="10,99,10,0"
|
||||
VerticalAlignment="Top"
|
||||
Content="{Binding StatusMessage, Mode=OneWay}" />
|
||||
<Label
|
||||
Height="26"
|
||||
Margin="10,158,10,0"
|
||||
VerticalAlignment="Top"
|
||||
Content="Select an audio input:" />
|
||||
<ComboBox
|
||||
Height="23"
|
||||
Margin="20,189,186,0"
|
||||
VerticalAlignment="Top"
|
||||
DisplayMemberPath="FriendlyName"
|
||||
ItemsSource="{Binding AvailableRecordDevices, Mode=TwoWay}"
|
||||
SelectedIndex="0"
|
||||
SelectedItem="{Binding SelectedDevice, Mode=TwoWay}" />
|
||||
<Button
|
||||
Width="91"
|
||||
Height="23"
|
||||
Margin="0,0,90,10"
|
||||
HorizontalAlignment="Right"
|
||||
VerticalAlignment="Bottom"
|
||||
Command="{Binding StartRecordingCommand}"
|
||||
Content="Record"
|
||||
IsEnabled="{Binding EnableStartRecord, Mode=OneWay}" />
|
||||
<Button
|
||||
Width="75"
|
||||
Height="23"
|
||||
Margin="0,0,10,10"
|
||||
HorizontalAlignment="Right"
|
||||
VerticalAlignment="Bottom"
|
||||
Command="{Binding StopRecordingCommand}"
|
||||
Content="Stop"
|
||||
IsEnabled="{Binding EnableStopRecord, Mode=OneWay}" />
|
||||
</Grid>
|
||||
</Window>
|
|
@ -0,0 +1,17 @@
|
|||
using CommonServiceLocator;
|
||||
using DeepSpeech.WPF.ViewModels;
|
||||
using System.Windows;
|
||||
|
||||
namespace DeepSpeechWPF
|
||||
{
|
||||
/// <summary>
|
||||
/// Interaction logic for MainWindow.xaml
|
||||
/// </summary>
|
||||
public partial class MainWindow : Window
|
||||
{
|
||||
public MainWindow() => InitializeComponent();
|
||||
|
||||
private void Window_Loaded(object sender, RoutedEventArgs e) =>
|
||||
DataContext = ServiceLocator.Current.GetInstance<MainWindowViewModel>();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
using System.Reflection;
|
||||
using System.Resources;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Windows;
|
||||
|
||||
// General Information about an assembly is controlled through the following
|
||||
// set of attributes. Change these attribute values to modify the information
|
||||
// associated with an assembly.
|
||||
[assembly: AssemblyTitle("DeepSpeech.WPF")]
|
||||
[assembly: AssemblyDescription("")]
|
||||
[assembly: AssemblyConfiguration("")]
|
||||
[assembly: AssemblyCompany("")]
|
||||
[assembly: AssemblyProduct("DeepSpeech.WPF.SingleFiles")]
|
||||
[assembly: AssemblyCopyright("Copyright © 2018")]
|
||||
[assembly: AssemblyTrademark("")]
|
||||
[assembly: AssemblyCulture("")]
|
||||
|
||||
// Setting ComVisible to false makes the types in this assembly not visible
|
||||
// to COM components. If you need to access a type in this assembly from
|
||||
// COM, set the ComVisible attribute to true on that type.
|
||||
[assembly: ComVisible(false)]
|
||||
|
||||
//In order to begin building localizable applications, set
|
||||
//<UICulture>CultureYouAreCodingWith</UICulture> in your .csproj file
|
||||
//inside a <PropertyGroup>. For example, if you are using US english
|
||||
//in your source files, set the <UICulture> to en-US. Then uncomment
|
||||
//the NeutralResourceLanguage attribute below. Update the "en-US" in
|
||||
//the line below to match the UICulture setting in the project file.
|
||||
|
||||
//[assembly: NeutralResourcesLanguage("en-US", UltimateResourceFallbackLocation.Satellite)]
|
||||
|
||||
|
||||
[assembly: ThemeInfo(
|
||||
ResourceDictionaryLocation.None, //where theme specific resource dictionaries are located
|
||||
//(used if a resource is not found in the page,
|
||||
// or application resource dictionaries)
|
||||
ResourceDictionaryLocation.SourceAssembly //where the generic resource dictionary is located
|
||||
//(used if a resource is not found in the page,
|
||||
// app, or any theme specific resource dictionaries)
|
||||
)]
|
||||
|
||||
|
||||
// Version information for an assembly consists of the following four values:
|
||||
//
|
||||
// Major Version
|
||||
// Minor Version
|
||||
// Build Number
|
||||
// Revision
|
||||
//
|
||||
// You can specify all the values or you can default the Build and Revision Numbers
|
||||
// by using the '*' as shown below:
|
||||
// [assembly: AssemblyVersion("1.0.*")]
|
||||
[assembly: AssemblyVersion("1.0.0.0")]
|
||||
[assembly: AssemblyFileVersion("1.0.0.0")]
|
|
@ -0,0 +1,63 @@
|
|||
//------------------------------------------------------------------------------
|
||||
// <auto-generated>
|
||||
// This code was generated by a tool.
|
||||
// Runtime Version:4.0.30319.42000
|
||||
//
|
||||
// Changes to this file may cause incorrect behavior and will be lost if
|
||||
// the code is regenerated.
|
||||
// </auto-generated>
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
namespace DeepSpeech.WPF.Properties {
|
||||
using System;
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// A strongly-typed resource class, for looking up localized strings, etc.
|
||||
/// </summary>
|
||||
// This class was auto-generated by the StronglyTypedResourceBuilder
|
||||
// class via a tool like ResGen or Visual Studio.
|
||||
// To add or remove a member, edit your .ResX file then rerun ResGen
|
||||
// with the /str option, or rebuild your VS project.
|
||||
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "15.0.0.0")]
|
||||
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
|
||||
[global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
|
||||
internal class Resources {
|
||||
|
||||
private static global::System.Resources.ResourceManager resourceMan;
|
||||
|
||||
private static global::System.Globalization.CultureInfo resourceCulture;
|
||||
|
||||
[global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")]
|
||||
internal Resources() {
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the cached ResourceManager instance used by this class.
|
||||
/// </summary>
|
||||
[global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
|
||||
internal static global::System.Resources.ResourceManager ResourceManager {
|
||||
get {
|
||||
if (object.ReferenceEquals(resourceMan, null)) {
|
||||
global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("DeepSpeech.WPF.Properties.Resources", typeof(Resources).Assembly);
|
||||
resourceMan = temp;
|
||||
}
|
||||
return resourceMan;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Overrides the current thread's CurrentUICulture property for all
|
||||
/// resource lookups using this strongly typed resource class.
|
||||
/// </summary>
|
||||
[global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
|
||||
internal static global::System.Globalization.CultureInfo Culture {
|
||||
get {
|
||||
return resourceCulture;
|
||||
}
|
||||
set {
|
||||
resourceCulture = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,117 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<root>
|
||||
<!--
|
||||
Microsoft ResX Schema
|
||||
|
||||
Version 2.0
|
||||
|
||||
The primary goals of this format is to allow a simple XML format
|
||||
that is mostly human readable. The generation and parsing of the
|
||||
various data types are done through the TypeConverter classes
|
||||
associated with the data types.
|
||||
|
||||
Example:
|
||||
|
||||
... ado.net/XML headers & schema ...
|
||||
<resheader name="resmimetype">text/microsoft-resx</resheader>
|
||||
<resheader name="version">2.0</resheader>
|
||||
<resheader name="reader">System.Resources.ResXResourceReader, System.Windows.Forms, ...</resheader>
|
||||
<resheader name="writer">System.Resources.ResXResourceWriter, System.Windows.Forms, ...</resheader>
|
||||
<data name="Name1"><value>this is my long string</value><comment>this is a comment</comment></data>
|
||||
<data name="Color1" type="System.Drawing.Color, System.Drawing">Blue</data>
|
||||
<data name="Bitmap1" mimetype="application/x-microsoft.net.object.binary.base64">
|
||||
<value>[base64 mime encoded serialized .NET Framework object]</value>
|
||||
</data>
|
||||
<data name="Icon1" type="System.Drawing.Icon, System.Drawing" mimetype="application/x-microsoft.net.object.bytearray.base64">
|
||||
<value>[base64 mime encoded string representing a byte array form of the .NET Framework object]</value>
|
||||
<comment>This is a comment</comment>
|
||||
</data>
|
||||
|
||||
There are any number of "resheader" rows that contain simple
|
||||
name/value pairs.
|
||||
|
||||
Each data row contains a name, and value. The row also contains a
|
||||
type or mimetype. Type corresponds to a .NET class that support
|
||||
text/value conversion through the TypeConverter architecture.
|
||||
Classes that don't support this are serialized and stored with the
|
||||
mimetype set.
|
||||
|
||||
The mimetype is used for serialized objects, and tells the
|
||||
ResXResourceReader how to depersist the object. This is currently not
|
||||
extensible. For a given mimetype the value must be set accordingly:
|
||||
|
||||
Note - application/x-microsoft.net.object.binary.base64 is the format
|
||||
that the ResXResourceWriter will generate, however the reader can
|
||||
read any of the formats listed below.
|
||||
|
||||
mimetype: application/x-microsoft.net.object.binary.base64
|
||||
value : The object must be serialized with
|
||||
: System.Serialization.Formatters.Binary.BinaryFormatter
|
||||
: and then encoded with base64 encoding.
|
||||
|
||||
mimetype: application/x-microsoft.net.object.soap.base64
|
||||
value : The object must be serialized with
|
||||
: System.Runtime.Serialization.Formatters.Soap.SoapFormatter
|
||||
: and then encoded with base64 encoding.
|
||||
|
||||
mimetype: application/x-microsoft.net.object.bytearray.base64
|
||||
value : The object must be serialized into a byte array
|
||||
: using a System.ComponentModel.TypeConverter
|
||||
: and then encoded with base64 encoding.
|
||||
-->
|
||||
<xsd:schema id="root" xmlns="" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata">
|
||||
<xsd:element name="root" msdata:IsDataSet="true">
|
||||
<xsd:complexType>
|
||||
<xsd:choice maxOccurs="unbounded">
|
||||
<xsd:element name="metadata">
|
||||
<xsd:complexType>
|
||||
<xsd:sequence>
|
||||
<xsd:element name="value" type="xsd:string" minOccurs="0" />
|
||||
</xsd:sequence>
|
||||
<xsd:attribute name="name" type="xsd:string" />
|
||||
<xsd:attribute name="type" type="xsd:string" />
|
||||
<xsd:attribute name="mimetype" type="xsd:string" />
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
<xsd:element name="assembly">
|
||||
<xsd:complexType>
|
||||
<xsd:attribute name="alias" type="xsd:string" />
|
||||
<xsd:attribute name="name" type="xsd:string" />
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
<xsd:element name="data">
|
||||
<xsd:complexType>
|
||||
<xsd:sequence>
|
||||
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
|
||||
<xsd:element name="comment" type="xsd:string" minOccurs="0" msdata:Ordinal="2" />
|
||||
</xsd:sequence>
|
||||
<xsd:attribute name="name" type="xsd:string" msdata:Ordinal="1" />
|
||||
<xsd:attribute name="type" type="xsd:string" msdata:Ordinal="3" />
|
||||
<xsd:attribute name="mimetype" type="xsd:string" msdata:Ordinal="4" />
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
<xsd:element name="resheader">
|
||||
<xsd:complexType>
|
||||
<xsd:sequence>
|
||||
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
|
||||
</xsd:sequence>
|
||||
<xsd:attribute name="name" type="xsd:string" use="required" />
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
</xsd:choice>
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
</xsd:schema>
|
||||
<resheader name="resmimetype">
|
||||
<value>text/microsoft-resx</value>
|
||||
</resheader>
|
||||
<resheader name="version">
|
||||
<value>2.0</value>
|
||||
</resheader>
|
||||
<resheader name="reader">
|
||||
<value>System.Resources.ResXResourceReader, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
|
||||
</resheader>
|
||||
<resheader name="writer">
|
||||
<value>System.Resources.ResXResourceWriter, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
|
||||
</resheader>
|
||||
</root>
|
|
@ -0,0 +1,26 @@
|
|||
//------------------------------------------------------------------------------
|
||||
// <auto-generated>
|
||||
// This code was generated by a tool.
|
||||
// Runtime Version:4.0.30319.42000
|
||||
//
|
||||
// Changes to this file may cause incorrect behavior and will be lost if
|
||||
// the code is regenerated.
|
||||
// </auto-generated>
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
namespace DeepSpeech.WPF.Properties {
|
||||
|
||||
|
||||
[global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
|
||||
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "15.9.0.0")]
|
||||
internal sealed partial class Settings : global::System.Configuration.ApplicationSettingsBase {
|
||||
|
||||
private static Settings defaultInstance = ((Settings)(global::System.Configuration.ApplicationSettingsBase.Synchronized(new Settings())));
|
||||
|
||||
public static Settings Default {
|
||||
get {
|
||||
return defaultInstance;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
<?xml version='1.0' encoding='utf-8'?>
|
||||
<SettingsFile xmlns="uri:settings" CurrentProfile="(Default)">
|
||||
<Profiles>
|
||||
<Profile Name="(Default)" />
|
||||
</Profiles>
|
||||
<Settings />
|
||||
</SettingsFile>
|
|
@ -0,0 +1,49 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.ComponentModel;
|
||||
using System.Runtime.CompilerServices;
|
||||
|
||||
namespace DeepSpeech.WPF.ViewModels
|
||||
{
|
||||
/// <summary>
|
||||
/// Implementation of <see cref="INotifyPropertyChanged"/> to simplify models.
|
||||
/// </summary>
|
||||
public abstract class BindableBase : INotifyPropertyChanged
|
||||
{
|
||||
/// <summary>
|
||||
/// Checks if a property already matches a desired value. Sets the property and
|
||||
/// notifies listeners only when necessary.
|
||||
/// </summary>
|
||||
/// <typeparam name="T">Type of the property.</typeparam>
|
||||
/// <param name="storage">Reference to a property with both getter and setter.</param>
|
||||
/// <param name="value">Desired value for the property.</param>
|
||||
/// <param name="propertyName">Name of the property used to notify listeners. This
|
||||
/// value is optional and can be provided automatically when invoked from compilers that
|
||||
/// support CallerMemberName.</param>
|
||||
/// <returns>True if the value was changed, false if the existing value matched the
|
||||
/// desired value.</returns>
|
||||
protected bool SetProperty<T>(ref T backingStore, T value,
|
||||
[CallerMemberName]string propertyName = "",
|
||||
Action onChanged = null)
|
||||
{
|
||||
if (EqualityComparer<T>.Default.Equals(backingStore, value))
|
||||
return false;
|
||||
backingStore = value;
|
||||
onChanged?.Invoke();
|
||||
OnPropertyChanged(propertyName);
|
||||
return true;
|
||||
}
|
||||
|
||||
#region INotifyPropertyChanged
|
||||
/// <summary>
|
||||
/// Notifies listeners that a property value has changed.
|
||||
/// </summary>
|
||||
/// <param name="propertyName">Name of the property used to notify listeners. This
|
||||
/// value is optional and can be provided automatically when invoked from compilers
|
||||
/// that support <see cref="CallerMemberNameAttribute"/>.</param>
|
||||
public event PropertyChangedEventHandler PropertyChanged;
|
||||
protected void OnPropertyChanged([CallerMemberName] string propertyName = "")
|
||||
=> PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(propertyName));
|
||||
#endregion
|
||||
}
|
||||
}
|
|
@ -0,0 +1,422 @@
|
|||
using AsyncAwaitBestPractices.MVVM;
|
||||
using CSCore;
|
||||
using CSCore.CoreAudioAPI;
|
||||
using CSCore.SoundIn;
|
||||
using CSCore.Streams;
|
||||
using DeepSpeechClient.Interfaces;
|
||||
using GalaSoft.MvvmLight.CommandWpf;
|
||||
using Microsoft.Win32;
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.ObjectModel;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace DeepSpeech.WPF.ViewModels
|
||||
{
|
||||
/// <summary>
|
||||
/// View model of the MainWindow View.
|
||||
/// </summary>
|
||||
public class MainWindowViewModel : BindableBase
|
||||
{
|
||||
#region Constants
|
||||
private const int SampleRate = 16000;
|
||||
private const string LMPath = "lm.binary";
|
||||
private const string TriePath = "trie";
|
||||
#endregion
|
||||
|
||||
private readonly IDeepSpeech _sttClient;
|
||||
|
||||
#region Commands
|
||||
/// <summary>
|
||||
/// Gets or sets the command that enables the language model.
|
||||
/// </summary>
|
||||
public IAsyncCommand EnableLanguageModelCommand { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the command that runs inference using an audio file.
|
||||
/// </summary>
|
||||
public IAsyncCommand InferenceFromFileCommand { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the command that opens a dialog to select an audio file.
|
||||
/// </summary>
|
||||
public RelayCommand SelectFileCommand { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the command that starts to record.
|
||||
/// </summary>
|
||||
public RelayCommand StartRecordingCommand { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the command that stops the recording and gets the result.
|
||||
/// </summary>
|
||||
public IAsyncCommand StopRecordingCommand { get; private set; }
|
||||
|
||||
#endregion
|
||||
|
||||
#region Streaming
|
||||
/// <summary>
|
||||
/// Records the audio of the selected device.
|
||||
/// </summary>
|
||||
private WasapiCapture _audioCapture;
|
||||
|
||||
/// <summary>
|
||||
/// Converts the device source into a wave source.
|
||||
/// </summary>
|
||||
private SoundInSource _soundInSource;
|
||||
|
||||
/// <summary>
|
||||
/// Target wave source (16 kHz, mono, 16-bit, as required by DeepSpeech).
|
||||
/// </summary>
|
||||
private IWaveSource _convertedSource;
|
||||
|
||||
/// <summary>
|
||||
/// Queue that prevents feeding data to the inference engine if it is busy.
|
||||
/// </summary>
|
||||
private ConcurrentQueue<short[]> _bufferQueue = new ConcurrentQueue<short[]>();
|
||||
|
||||
private int _threadSafeBoolBackValue = 0;
|
||||
|
||||
/// <summary>
|
||||
/// Thread-safe flag (via Interlocked) used to process items in the queue one at a time.
|
||||
/// </summary>
|
||||
public bool StreamingIsBusy
|
||||
{
|
||||
get => (Interlocked.CompareExchange(ref _threadSafeBoolBackValue, 1, 1) == 1);
|
||||
set
|
||||
{
|
||||
if (value) Interlocked.CompareExchange(ref _threadSafeBoolBackValue, 1, 0);
|
||||
else Interlocked.CompareExchange(ref _threadSafeBoolBackValue, 0, 1);
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region ViewProperties
|
||||
|
||||
private bool _enableStartRecord;
|
||||
/// <summary>
|
||||
/// Gets or sets the record status that controls the start recording command.
|
||||
/// </summary>
|
||||
public bool EnableStartRecord
|
||||
{
|
||||
get => _enableStartRecord;
|
||||
set => SetProperty(ref _enableStartRecord, value);
|
||||
}
|
||||
|
||||
private bool _enableStopRecord;
|
||||
/// <summary>
|
||||
/// Gets or sets the record status that controls the stop recording command.
|
||||
/// </summary>
|
||||
public bool EnableStopRecord
|
||||
{
|
||||
get => _enableStopRecord;
|
||||
set => SetProperty(ref _enableStopRecord, value,
|
||||
onChanged: ()=> ((AsyncCommand)StopRecordingCommand).RaiseCanExecuteChanged());
|
||||
}
|
||||
|
||||
private MMDevice _selectedDevice;
|
||||
/// <summary>
|
||||
/// Gets or sets the selected recording device.
|
||||
/// </summary>
|
||||
public MMDevice SelectedDevice
|
||||
{
|
||||
get => _selectedDevice;
|
||||
set => SetProperty(ref _selectedDevice, value,
|
||||
onChanged: UpdateSelectedDevice);
|
||||
}
|
||||
|
||||
private string _statusMessage;
|
||||
/// <summary>
|
||||
/// Gets or sets status message.
|
||||
/// </summary>
|
||||
public string StatusMessage
|
||||
{
|
||||
get => _statusMessage;
|
||||
set => SetProperty(ref _statusMessage, value);
|
||||
}
|
||||
|
||||
private bool _languageModelEnabled;
|
||||
/// <summary>
|
||||
/// Gets or sets the language model status.
|
||||
/// </summary>
|
||||
private bool LanguageModelEnabled
|
||||
{
|
||||
get => _languageModelEnabled;
|
||||
set => SetProperty(ref _languageModelEnabled, value,
|
||||
onChanged: () => ((AsyncCommand)EnableLanguageModelCommand).RaiseCanExecuteChanged());
|
||||
}
|
||||
|
||||
private bool _isRunningInference;
|
||||
/// <summary>
|
||||
/// Gets or sets whether the model is running inference.
|
||||
/// </summary>
|
||||
private bool IsRunningInference
|
||||
{
|
||||
get => _isRunningInference;
|
||||
set => SetProperty(ref _isRunningInference, value,
|
||||
onChanged: () => ((AsyncCommand)InferenceFromFileCommand).RaiseCanExecuteChanged());
|
||||
}
|
||||
|
||||
private string _transcription;
|
||||
/// <summary>
|
||||
/// Gets or sets the current transcription.
|
||||
/// </summary>
|
||||
public string Transcription
|
||||
{
|
||||
get => _transcription;
|
||||
set => SetProperty(ref _transcription, value);
|
||||
}
|
||||
|
||||
private string _audioFilePath;
|
||||
/// <summary>
|
||||
/// Gets or sets the selected audio file path.
|
||||
/// </summary>
|
||||
public string AudioFilePath
|
||||
{
|
||||
get => _audioFilePath;
|
||||
set => SetProperty(ref _audioFilePath, value);
|
||||
}
|
||||
|
||||
private ObservableCollection<MMDevice> _deviceNames;
|
||||
/// <summary>
|
||||
/// Gets or sets the available recording devices.
|
||||
/// </summary>
|
||||
public ObservableCollection<MMDevice> AvailableRecordDevices
|
||||
{
|
||||
get => _deviceNames;
|
||||
set => SetProperty(ref _deviceNames, value);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Ctors
|
||||
public MainWindowViewModel(IDeepSpeech sttClient)
|
||||
{
|
||||
_sttClient = sttClient;
|
||||
|
||||
EnableLanguageModelCommand = new AsyncCommand(()=>EnableLanguageModelAsync(LMPath,TriePath),
|
||||
_ => !LanguageModelEnabled);
|
||||
|
||||
InferenceFromFileCommand = new AsyncCommand(ExecuteInferenceFromFileAsync,
|
||||
_ => !IsRunningInference);
|
||||
|
||||
SelectFileCommand = new RelayCommand(SelectAudioFile);
|
||||
|
||||
StartRecordingCommand = new RelayCommand(StartRecording,
|
||||
canExecute: CanExecuteStartRecording);
|
||||
|
||||
StopRecordingCommand = new AsyncCommand(StopRecordingAsync,
|
||||
_ => EnableStopRecord);
|
||||
|
||||
LoadAvailableCaptureDevices();
|
||||
}
|
||||
#endregion
|
||||
|
||||
/// <summary>
|
||||
/// Releases the current capture device and initializes the selected one.
|
||||
/// </summary>
|
||||
private void UpdateSelectedDevice()
|
||||
{
|
||||
ReleaseCapture();
|
||||
InitializeAudioCapture();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Releases the capture device.
|
||||
/// </summary>
|
||||
private void ReleaseCapture()
|
||||
{
|
||||
if (_audioCapture != null)
|
||||
{
|
||||
_audioCapture.DataAvailable -= Capture_DataAvailable;
|
||||
_audioCapture.Dispose();
|
||||
}
|
||||
}
|
||||
/// <summary>
|
||||
/// Used by the start recording command to know when the recording can start.
|
||||
/// </summary>
|
||||
/// <returns>True if a recording device is selected; otherwise false.</returns>
|
||||
private bool CanExecuteStartRecording() =>
|
||||
SelectedDevice != null;
|
||||
|
||||
/// <summary>
|
||||
/// Loads all the available audio capture devices.
|
||||
/// </summary>
|
||||
private void LoadAvailableCaptureDevices()
|
||||
{
|
||||
AvailableRecordDevices = new ObservableCollection<MMDevice>(
|
||||
MMDeviceEnumerator.EnumerateDevices(DataFlow.All, DeviceState.Active)); //we get only enabled devices
|
||||
EnableStartRecord = true;
|
||||
if (AvailableRecordDevices?.Count != 0)
|
||||
SelectedDevice = AvailableRecordDevices[0];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Initializes the capture source.
|
||||
/// </summary>
|
||||
private void InitializeAudioCapture()
|
||||
{
|
||||
if (SelectedDevice != null)
|
||||
{
|
||||
_audioCapture = SelectedDevice.DataFlow == DataFlow.Capture ?
|
||||
new WasapiCapture() : new WasapiLoopbackCapture();
|
||||
_audioCapture.Device = SelectedDevice;
|
||||
_audioCapture.Initialize();
|
||||
_audioCapture.DataAvailable += Capture_DataAvailable;
|
||||
_soundInSource = new SoundInSource(_audioCapture) { FillWithZeros = false };
|
||||
//create a source that converts the data provided by the
|
||||
//soundInSource to the required format
|
||||
_convertedSource = _soundInSource
|
||||
.ChangeSampleRate(SampleRate) // sample rate
|
||||
.ToSampleSource()
|
||||
.ToWaveSource(16); //bits per sample
|
||||
|
||||
_convertedSource = _convertedSource.ToMono();
|
||||
}
|
||||
}
|
||||
|
||||
private void Capture_DataAvailable(object sender, DataAvailableEventArgs e)
|
||||
{
|
||||
//read data from the convertedSource
|
||||
//important: don't use the e.Data here
|
||||
//the e.Data contains the raw data provided by the
|
||||
//soundInSource which won't have the deepspeech required audio format
|
||||
byte[] buffer = new byte[_convertedSource.WaveFormat.BytesPerSecond / 2];
|
||||
|
||||
int read;
|
||||
//keep reading as long as we still get some data
|
||||
while ((read = _convertedSource.Read(buffer, 0, buffer.Length)) > 0)
|
||||
{
|
||||
short[] sdata = new short[(int)Math.Ceiling(Convert.ToDecimal(read / 2))];
|
||||
Buffer.BlockCopy(buffer, 0, sdata, 0, read);
|
||||
_bufferQueue.Enqueue(sdata);
|
||||
Task.Run(() => OnNewData());
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Starts processing data from the queue.
|
||||
/// </summary>
|
||||
private void OnNewData()
|
||||
{
|
||||
while (!StreamingIsBusy && !_bufferQueue.IsEmpty)
|
||||
{
|
||||
if (_bufferQueue.TryDequeue(out short[] buffer))
|
||||
{
|
||||
StreamingIsBusy = true;
|
||||
_sttClient.FeedAudioContent(buffer, Convert.ToUInt32(buffer.Length));
|
||||
StreamingIsBusy = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Enables the language model.
|
||||
/// </summary>
|
||||
/// <param name="lmPath">Language model path.</param>
|
||||
/// <param name="triePath">Trie path.</param>
|
||||
/// <returns>A Task to await.</returns>
|
||||
public async Task EnableLanguageModelAsync(string lmPath, string triePath)
|
||||
{
|
||||
try
|
||||
{
|
||||
StatusMessage = "Loading language model...";
|
||||
const float LM_ALPHA = 0.75f;
|
||||
const float LM_BETA = 1.85f;
|
||||
await Task.Run(() => _sttClient.EnableDecoderWithLM(lmPath, triePath, LM_ALPHA, LM_BETA));
|
||||
LanguageModelEnabled = true;
|
||||
StatusMessage = "Language model loaded.";
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
StatusMessage = ex.Message;
|
||||
}
|
||||
}
|
||||
/// <summary>
|
||||
/// Runs inference and sets the transcription of an audio file.
|
||||
/// </summary>
|
||||
/// <returns>A Task to await.</returns>
|
||||
public async Task ExecuteInferenceFromFileAsync()
|
||||
{
|
||||
try
|
||||
{
|
||||
IsRunningInference = true;
|
||||
Transcription = string.Empty;
|
||||
StatusMessage = "Running inference...";
|
||||
Stopwatch watch = new Stopwatch();
|
||||
var waveBuffer = new NAudio.Wave.WaveBuffer(File.ReadAllBytes(AudioFilePath));
|
||||
using (var waveInfo = new NAudio.Wave.WaveFileReader(AudioFilePath))
|
||||
{
|
||||
watch.Start();
|
||||
string speechResult = await Task.Run(() => _sttClient.SpeechToText(
|
||||
waveBuffer.ShortBuffer,
|
||||
Convert.ToUInt32(waveBuffer.MaxSize / 2)));
|
||||
|
||||
watch.Stop();
|
||||
Transcription = $"Audio duration: {waveInfo.TotalTime.ToString()} {Environment.NewLine}" +
|
||||
$"Inference took: {watch.Elapsed.ToString()} {Environment.NewLine}" +
|
||||
$"Recognized text: {speechResult}";
|
||||
}
|
||||
waveBuffer.Clear();
|
||||
StatusMessage = string.Empty;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
StatusMessage = ex.Message;
|
||||
}
|
||||
finally
|
||||
{
|
||||
IsRunningInference = false;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Stops the recording and sets the transcription of the closed stream.
|
||||
/// </summary>
|
||||
/// <returns>A Task to await.</returns>
|
||||
private async Task StopRecordingAsync()
|
||||
{
|
||||
EnableStopRecord = false;
|
||||
_audioCapture.Stop();
|
||||
while (!_bufferQueue.IsEmpty && StreamingIsBusy) //we wait for all the queued buffers to be processed
|
||||
{
|
||||
await Task.Delay(90);
|
||||
}
|
||||
Transcription = _sttClient.FinishStream();
|
||||
EnableStartRecord = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new stream and starts the recording.
|
||||
/// </summary>
|
||||
private void StartRecording()
|
||||
{
|
||||
_sttClient.CreateStream();
|
||||
_audioCapture.Start();
|
||||
EnableStartRecord = false;
|
||||
EnableStopRecord = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Opens a dialog to select an audio file.
|
||||
/// </summary>
|
||||
private void SelectAudioFile()
|
||||
{
|
||||
OpenFileDialog dialog = new OpenFileDialog
|
||||
{
|
||||
Filter = "wav Files |*.wav",
|
||||
Multiselect = false,
|
||||
Title = "Please select a wav file."
|
||||
};
|
||||
|
||||
if ((bool)dialog.ShowDialog())
|
||||
{
|
||||
AudioFilePath = dialog.FileName;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<packages>
|
||||
<package id="AsyncAwaitBestPractices" version="3.1.0" targetFramework="net462" />
|
||||
<package id="AsyncAwaitBestPractices.MVVM" version="3.1.0" targetFramework="net462" />
|
||||
<package id="CommonServiceLocator" version="2.0.2" targetFramework="net462" />
|
||||
<package id="CSCore" version="1.2.1.2" targetFramework="net462" />
|
||||
<package id="MvvmLightLibs" version="5.4.1.1" targetFramework="net462" />
|
||||
<package id="NAudio" version="1.9.0" targetFramework="net462" />
|
||||
</packages>
|
|
@ -0,0 +1,58 @@
|
|||
# NodeJS voice recognition example using Mozilla DeepSpeech
|
||||
|
||||
Download the pre-trained model (1.8GB):
|
||||
|
||||
```
|
||||
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.6.0/deepspeech-0.6.0-models.tar.gz
|
||||
tar xvfz deepspeech-0.6.0-models.tar.gz
|
||||
```
|
||||
|
||||
Edit references to models path if necessary:
|
||||
|
||||
```
|
||||
let modelPath = './models/output_graph.pbmm';
|
||||
let lmPath = './models/lm.binary';
|
||||
let triePath = './models/trie';
|
||||
```
|
||||
|
||||
Install Sox (for .wav file loading):
|
||||
|
||||
```
|
||||
brew install sox
|
||||
```
|
||||
|
||||
Download test audio files:
|
||||
|
||||
```
|
||||
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz
|
||||
tar xfvz audio-0.4.1.tar.gz
|
||||
```
|
||||
|
||||
Install NPM dependencies:
|
||||
|
||||
```
|
||||
npm install
|
||||
```
|
||||
|
||||
Run:
|
||||
|
||||
```
|
||||
node index.js
|
||||
```
|
||||
|
||||
Result should be something like:
|
||||
|
||||
```
|
||||
audio length 1.975
|
||||
result: experience proves this
|
||||
|
||||
```
|
||||
|
||||
Try other wav files with an argument:
|
||||
|
||||
```
|
||||
node index.js audio/2830-3980-0043.wav
|
||||
node index.js audio/8455-210777-0068.wav
|
||||
node index.js audio/4507-16021-0012.wav
|
||||
```
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
const DeepSpeech = require('deepspeech');
|
||||
const Fs = require('fs');
|
||||
const Sox = require('sox-stream');
|
||||
const MemoryStream = require('memory-stream');
|
||||
const Duplex = require('stream').Duplex;
|
||||
const Wav = require('node-wav');
|
||||
|
||||
const BEAM_WIDTH = 1024;
|
||||
let modelPath = './models/output_graph.pbmm';
|
||||
|
||||
let model = new DeepSpeech.Model(modelPath, BEAM_WIDTH);
|
||||
|
||||
let desiredSampleRate = model.sampleRate();
|
||||
|
||||
const LM_ALPHA = 0.75;
|
||||
const LM_BETA = 1.85;
|
||||
let lmPath = './models/lm.binary';
|
||||
let triePath = './models/trie';
|
||||
|
||||
model.enableDecoderWithLM(lmPath, triePath, LM_ALPHA, LM_BETA);
|
||||
|
||||
let audioFile = process.argv[2] || './audio/2830-3980-0043.wav';
|
||||
|
||||
if (!Fs.existsSync(audioFile)) {
|
||||
console.log('file missing:', audioFile);
|
||||
process.exit();
|
||||
}
|
||||
|
||||
const buffer = Fs.readFileSync(audioFile);
|
||||
const result = Wav.decode(buffer);
|
||||
|
||||
if (result.sampleRate < desiredSampleRate) {
|
||||
console.error('Warning: original sample rate (' + result.sampleRate + ') is lower than ' + desiredSampleRate + 'Hz. Up-sampling might produce erratic speech recognition.');
|
||||
}
|
||||
|
||||
function bufferToStream(buffer) {
|
||||
let stream = new Duplex();
|
||||
stream.push(buffer);
|
||||
stream.push(null);
|
||||
return stream;
|
||||
}
|
||||
|
||||
let audioStream = new MemoryStream();
|
||||
bufferToStream(buffer).
|
||||
pipe(Sox({
|
||||
global: {
|
||||
'no-dither': true,
|
||||
},
|
||||
output: {
|
||||
bits: 16,
|
||||
rate: desiredSampleRate,
|
||||
channels: 1,
|
||||
encoding: 'signed-integer',
|
||||
endian: 'little',
|
||||
compression: 0.0,
|
||||
type: 'raw'
|
||||
}
|
||||
})).
|
||||
pipe(audioStream);
|
||||
|
||||
audioStream.on('finish', () => {
|
||||
let audioBuffer = audioStream.toBuffer();
|
||||
|
||||
const audioLength = (audioBuffer.length / 2) * (1 / desiredSampleRate);
|
||||
console.log('audio length', audioLength);
|
||||
|
||||
let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2));
|
||||
|
||||
console.log('result:', result);
|
||||
});
|
|
@ -0,0 +1,17 @@
|
|||
{
|
||||
"name": "deepspeech-nodejs_wav",
|
||||
"version": "1.0.0",
|
||||
"description": "Simple audio processing",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"start": "node ./index.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"argparse": "^1.0.10",
|
||||
"deepspeech": "0.6.0",
|
||||
"node-wav": "0.0.2",
|
||||
"sox-stream": "^2.0.3",
|
||||
"util": "^0.11.1"
|
||||
},
|
||||
"license": "Public domain"
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -xe
|
||||
|
||||
THIS=$(dirname "$0")
|
||||
|
||||
pushd ${THIS}
|
||||
source ../tests.sh
|
||||
|
||||
npm install $(get_npm_package_url)
|
||||
npm install
|
||||
|
||||
ln -s $HOME/DeepSpeech/models models
|
||||
|
||||
node index.js $HOME/DeepSpeech/audio/2830-3980-0043.wav
|
||||
node index.js $HOME/DeepSpeech/audio/8455-210777-0068.wav
|
||||
node index.js $HOME/DeepSpeech/audio/4507-16021-0012.wav
|
||||
popd
|
|
@ -0,0 +1,23 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -xe
|
||||
|
||||
THIS=$(dirname "$0")
|
||||
|
||||
source ../../taskcluster/tc-tests-utils.sh
|
||||
|
||||
DEP_TASK_ID=$(curl -s https://community-tc.services.mozilla.com/api/queue/v1/task/${TASK_ID} | python -c 'import json; import sys; print(" ".join(json.loads(sys.stdin.read())["dependencies"]));')
|
||||
|
||||
get_python_wheel_url()
|
||||
{
|
||||
local this_python_version=$1
|
||||
|
||||
extract_python_versions "${this_python_version}" "pyver" "pyver_pkg" "py_unicode_type" "pyconf" "pyalias"
|
||||
|
||||
echo "$(get_python_pkg_url "${pyver_pkg}" "${py_unicode_type}" "deepspeech" https://community-tc.services.mozilla.com/api/queue/v1/task/${DEP_TASK_ID}/artifacts/public)"
|
||||
}
|
||||
|
||||
get_npm_package_url()
|
||||
{
|
||||
echo "https://community-tc.services.mozilla.com/api/queue/v1/task/${DEP_TASK_ID}/artifacts/public/deepspeech-${DS_VERSION}.tgz"
|
||||
}
|
|
@ -0,0 +1,92 @@
|
|||
import sys
|
||||
import os
|
||||
import logging
|
||||
import argparse
|
||||
import subprocess
|
||||
import shlex
|
||||
import numpy as np
|
||||
import wavTranscriber
|
||||
|
||||
# Debug helpers
|
||||
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
|
||||
|
||||
|
||||
def main(args):
|
||||
parser = argparse.ArgumentParser(description='Transcribe long audio files using webRTC VAD or use the streaming interface')
|
||||
parser.add_argument('--aggressive', type=int, choices=range(4), required=False,
|
||||
help='Determines how aggressive filtering out non-speech is. (Integer between 0-3)')
|
||||
parser.add_argument('--audio', required=False,
|
||||
help='Path to the audio file to run (WAV format)')
|
||||
parser.add_argument('--model', required=True,
|
||||
help='Path to directory that contains all model files (output_graph, lm and trie)')
|
||||
parser.add_argument('--stream', required=False, action='store_true',
|
||||
help='To use deepspeech streaming interface')
|
||||
args = parser.parse_args()
|
||||
if args.stream is True:
|
||||
print("Opening mic for streaming")
|
||||
elif args.audio is not None:
|
||||
logging.debug("Transcribing audio file @ %s" % args.audio)
|
||||
else:
|
||||
parser.print_help()
|
||||
parser.exit()
|
||||
|
||||
# Point to a path containing the pre-trained models & resolve ~ if used
|
||||
dirName = os.path.expanduser(args.model)
|
||||
|
||||
# Resolve all the paths of model files
|
||||
output_graph, lm, trie = wavTranscriber.resolve_models(dirName)
|
||||
|
||||
# Load output_graph, alphabet, lm and trie
|
||||
model_retval = wavTranscriber.load_model(output_graph, lm, trie)
|
||||
|
||||
if args.audio is not None:
|
||||
title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'LM Load Time(s)']
|
||||
print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
|
||||
|
||||
inference_time = 0.0
|
||||
|
||||
# Run VAD on the input file
|
||||
waveFile = args.audio
|
||||
segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, args.aggressive)
|
||||
f = open(os.path.splitext(waveFile)[0] + ".txt", 'w')
|
||||
logging.debug("Saving Transcript @: %s" % waveFile.rstrip(".wav") + ".txt")
|
||||
|
||||
for i, segment in enumerate(segments):
|
||||
# Run deepspeech on the chunk that just completed VAD
|
||||
logging.debug("Processing chunk %002d" % (i,))
|
||||
audio = np.frombuffer(segment, dtype=np.int16)
|
||||
output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
|
||||
inference_time += output[1]
|
||||
logging.debug("Transcript: %s" % output[0])
|
||||
|
||||
f.write(output[0] + " ")
|
||||
|
||||
# Summary of the files processed
|
||||
f.close()
|
||||
|
||||
# Extract filename from the full file path
|
||||
filename, ext = os.path.splitext(os.path.basename(waveFile))
|
||||
logging.debug("************************************************************************************************************")
|
||||
logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
|
||||
logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2]))
|
||||
logging.debug("************************************************************************************************************")
|
||||
print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2]))
|
||||
else:
|
||||
sctx = model_retval[0].createStream()
|
||||
subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
|
||||
stdout=subprocess.PIPE,
|
||||
bufsize=0)
|
||||
print('You can start speaking now. Press Control-C to stop recording.')
|
||||
|
||||
try:
|
||||
while True:
|
||||
data = subproc.stdout.read(512)
|
||||
model_retval[0].feedAudioContent(sctx, np.frombuffer(data, np.int16))
|
||||
except KeyboardInterrupt:
|
||||
print('Transcription: ', model_retval[0].finishStream(sctx))
|
||||
subproc.terminate()
|
||||
subproc.wait()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv[1:])
|
|
@ -0,0 +1,388 @@
|
|||
import sys
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import traceback
|
||||
import numpy as np
|
||||
import wavTranscriber
|
||||
from PyQt5.QtWidgets import *
|
||||
from PyQt5.QtGui import *
|
||||
from PyQt5.QtCore import *
|
||||
import shlex
|
||||
import subprocess
|
||||
|
||||
# Debug helpers
|
||||
logging.basicConfig(stream=sys.stderr,
|
||||
level=logging.DEBUG,
|
||||
format='%(filename)s - %(funcName)s@%(lineno)d %(name)s:%(levelname)s %(message)s')
|
||||
|
||||
|
||||
class WorkerSignals(QObject):
|
||||
'''
|
||||
Defines the signals available from a running worker thread.
|
||||
Supported signals are:
|
||||
|
||||
finished:
|
||||
No data
|
||||
|
||||
error
|
||||
'tuple' (exctype, value, traceback.format_exc())
|
||||
|
||||
result
|
||||
'object' data returned from processing, anything
|
||||
|
||||
progress
|
||||
'object' indicating the transcribed result
|
||||
'''
|
||||
|
||||
finished = pyqtSignal()
|
||||
error = pyqtSignal(tuple)
|
||||
result = pyqtSignal(object)
|
||||
progress = pyqtSignal(object)
|
||||
|
||||
|
||||
class Worker(QRunnable):
|
||||
'''
|
||||
Worker Thread
|
||||
|
||||
Inherits from QRunnable to handle worker thread setup, signals and wrap-up
|
||||
|
||||
@param callback:
|
||||
The function callback to run on this worker thread.
|
||||
Supplied args and kwargs will be passed through the runner.
|
||||
@type callback: function
|
||||
@param args: Arguments to pass to the callback function
|
||||
@param kwargs: Keywords to pass to the callback function
|
||||
'''
|
||||
|
||||
def __init__(self, fn, *args, **kwargs):
|
||||
super(Worker, self).__init__()
|
||||
|
||||
# Store the constructor arguments (re-used for processing)
|
||||
self.fn = fn
|
||||
self.args = args
|
||||
self.kwargs = kwargs
|
||||
self.signals = WorkerSignals()
|
||||
|
||||
# Add the callback to our kwargs
|
||||
self.kwargs['progress_callback'] = self.signals.progress
|
||||
|
||||
@pyqtSlot()
|
||||
def run(self):
|
||||
'''
|
||||
Initialise the runner function with the passed args, kwargs
|
||||
'''
|
||||
|
||||
# Retrieve args/kwargs here; and fire up the processing using them
|
||||
try:
|
||||
transcript = self.fn(*self.args, **self.kwargs)
|
||||
except:
|
||||
traceback.print_exc()
|
||||
exctype, value = sys.exc_info()[:2]
|
||||
self.signals.error.emit((exctype, value, traceback.format_exc()))
|
||||
else:
|
||||
# Return the result of the processing
|
||||
self.signals.result.emit(transcript)
|
||||
finally:
|
||||
# Done
|
||||
self.signals.finished.emit()
|
||||
|
||||
|
||||
class App(QMainWindow):
|
||||
dirName = ""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.title = 'Deepspeech Transcriber'
|
||||
self.left = 10
|
||||
self.top = 10
|
||||
self.width = 480
|
||||
self.height = 400
|
||||
self.initUI()
|
||||
|
||||
def initUI(self):
|
||||
self.setWindowTitle(self.title)
|
||||
self.setGeometry(self.left, self.top, self.width, self.height)
|
||||
layout = QGridLayout()
|
||||
layout.setSpacing(10)
|
||||
|
||||
self.microphone = QRadioButton("Microphone")
|
||||
self.fileUpload = QRadioButton("File Upload")
|
||||
self.browseBox = QLineEdit(self, placeholderText="Wave File, Mono @ 16 kHz, 16bit Little-Endian")
|
||||
self.modelsBox = QLineEdit(self, placeholderText="Directory path for output_graph, lm & trie")
|
||||
self.textboxTranscript = QPlainTextEdit(self, placeholderText="Transcription")
|
||||
self.browseButton = QPushButton('Browse', self)
|
||||
self.browseButton.setToolTip('Select a wav file')
|
||||
self.modelsButton = QPushButton('Browse', self)
|
||||
self.modelsButton.setToolTip('Select deepspeech models folder')
|
||||
self.transcribeWav = QPushButton('Transcribe Wav', self)
|
||||
self.transcribeWav.setToolTip('Start Wav Transcription')
|
||||
self.openMicrophone = QPushButton('Start Speaking', self)
|
||||
self.openMicrophone.setToolTip('Open Microphone')
|
||||
|
||||
layout.addWidget(self.microphone, 0, 1, 1, 2)
|
||||
layout.addWidget(self.fileUpload, 0, 3, 1, 2)
|
||||
layout.addWidget(self.browseBox, 1, 0, 1, 4)
|
||||
layout.addWidget(self.browseButton, 1, 4)
|
||||
layout.addWidget(self.modelsBox, 2, 0, 1, 4)
|
||||
layout.addWidget(self.modelsButton, 2, 4)
|
||||
layout.addWidget(self.transcribeWav, 3, 1, 1, 1)
|
||||
layout.addWidget(self.openMicrophone, 3, 3, 1, 1)
|
||||
layout.addWidget(self.textboxTranscript, 5, 0, -1, 0)
|
||||
|
||||
w = QWidget()
|
||||
w.setLayout(layout)
|
||||
|
||||
self.setCentralWidget(w)
|
||||
|
||||
# Microphone
|
||||
self.microphone.clicked.connect(self.mic_activate)
|
||||
|
||||
# File Upload
|
||||
self.fileUpload.clicked.connect(self.wav_activate)
|
||||
|
||||
# Connect Browse Button to Function on_click
|
||||
self.browseButton.clicked.connect(self.browse_on_click)
|
||||
|
||||
# Connect the Models Button
|
||||
self.modelsButton.clicked.connect(self.models_on_click)
|
||||
|
||||
# Connect Transcription button to threadpool
|
||||
self.transcribeWav.clicked.connect(self.transcriptionStart_on_click)
|
||||
|
||||
# Connect Microphone button to threadpool
|
||||
self.openMicrophone.clicked.connect(self.openMicrophone_on_click)
|
||||
self.openMicrophone.setCheckable(True)
|
||||
self.openMicrophone.toggle()
|
||||
|
||||
self.browseButton.setEnabled(False)
|
||||
self.browseBox.setEnabled(False)
|
||||
self.modelsBox.setEnabled(False)
|
||||
self.modelsButton.setEnabled(False)
|
||||
self.transcribeWav.setEnabled(False)
|
||||
self.openMicrophone.setEnabled(False)
|
||||
|
||||
self.show()
|
||||
|
||||
# Setup Threadpool
|
||||
self.threadpool = QThreadPool()
|
||||
logging.debug("Multithreading with maximum %d threads" % self.threadpool.maxThreadCount())
|
||||
|
||||
@pyqtSlot()
|
||||
def mic_activate(self):
|
||||
logging.debug("Enable streaming widgets")
|
||||
self.en_mic = True
|
||||
self.browseButton.setEnabled(False)
|
||||
self.browseBox.setEnabled(False)
|
||||
self.modelsBox.setEnabled(True)
|
||||
self.modelsButton.setEnabled(True)
|
||||
self.transcribeWav.setEnabled(False)
|
||||
self.openMicrophone.setStyleSheet('QPushButton {background-color: #70cc7c; color: black;}')
|
||||
self.openMicrophone.setEnabled(True)
|
||||
|
||||
@pyqtSlot()
|
||||
def wav_activate(self):
|
||||
logging.debug("Enable wav transcription widgets")
|
||||
self.en_mic = False
|
||||
self.openMicrophone.setStyleSheet('QPushButton {background-color: #f7f7f7; color: black;}')
|
||||
self.openMicrophone.setEnabled(False)
|
||||
self.browseButton.setEnabled(True)
|
||||
self.browseBox.setEnabled(True)
|
||||
self.modelsBox.setEnabled(True)
|
||||
self.modelsButton.setEnabled(True)
|
||||
|
||||
@pyqtSlot()
|
||||
def browse_on_click(self):
|
||||
logging.debug('Browse button clicked')
|
||||
options = QFileDialog.Options()
|
||||
options |= QFileDialog.DontUseNativeDialog
|
||||
self.fileName, _ = QFileDialog.getOpenFileName(self, "Select wav file to be Transcribed", "","All Files (*.wav)")
|
||||
if self.fileName:
|
||||
self.browseBox.setText(self.fileName)
|
||||
self.transcribeWav.setEnabled(True)
|
||||
logging.debug(self.fileName)
|
||||
|
||||
@pyqtSlot()
|
||||
def models_on_click(self):
|
||||
logging.debug('Models Browse Button clicked')
|
||||
self.dirName = QFileDialog.getExistingDirectory(self, "Select deepspeech models directory")
|
||||
if self.dirName:
|
||||
self.modelsBox.setText(self.dirName)
|
||||
logging.debug(self.dirName)
|
||||
|
||||
# Threaded signal passing worker functions
|
||||
worker = Worker(self.modelWorker, self.dirName)
|
||||
worker.signals.result.connect(self.modelResult)
|
||||
worker.signals.finished.connect(self.modelFinish)
|
||||
worker.signals.progress.connect(self.modelProgress)
|
||||
|
||||
# Execute
|
||||
self.threadpool.start(worker)
|
||||
else:
|
||||
logging.critical("*****************************************************")
|
||||
logging.critical("Model path not specified..")
|
||||
logging.critical("*****************************************************")
|
||||
return "Transcription Failed, models path not specified"
|
||||
|
||||
def modelWorker(self, dirName, progress_callback):
|
||||
self.textboxTranscript.setPlainText("Loading Models...")
|
||||
self.openMicrophone.setStyleSheet('QPushButton {background-color: #f7f7f7; color: black;}')
|
||||
self.openMicrophone.setEnabled(False)
|
||||
self.show()
|
||||
time.sleep(1)
|
||||
return dirName
|
||||
|
||||
def modelProgress(self, s):
|
||||
# FixMe: Write code to show progress here
|
||||
pass
|
||||
|
||||
def modelResult(self, dirName):
|
||||
# Fetch and Resolve all the paths of model files
|
||||
output_graph, lm, trie = wavTranscriber.resolve_models(dirName)
|
||||
# Load output_graph, alphabet, lm and trie
|
||||
self.model = wavTranscriber.load_model(output_graph, lm, trie)
|
||||
|
||||
def modelFinish(self):
|
||||
# self.timer.stop()
|
||||
self.textboxTranscript.setPlainText("Loaded Models, start transcribing")
|
||||
if self.en_mic is True:
|
||||
self.openMicrophone.setStyleSheet('QPushButton {background-color: #70cc7c; color: black;}')
|
||||
self.openMicrophone.setEnabled(True)
|
||||
self.show()
|
||||
|
||||
@pyqtSlot()
|
||||
def transcriptionStart_on_click(self):
|
||||
logging.debug('Transcription Start button clicked')
|
||||
|
||||
# Clear out older data
|
||||
self.textboxTranscript.setPlainText("")
|
||||
self.show()
|
||||
|
||||
# Threaded signal passing worker functions
|
||||
worker = Worker(self.wavWorker, self.fileName)
|
||||
worker.signals.progress.connect(self.progress)
|
||||
worker.signals.result.connect(self.transcription)
|
||||
worker.signals.finished.connect(self.wavFinish)
|
||||
|
||||
# Execute
|
||||
self.threadpool.start(worker)
|
||||
|
||||
@pyqtSlot()
|
||||
def openMicrophone_on_click(self):
|
||||
logging.debug('Preparing to open microphone...')
|
||||
|
||||
# Clear out older data
|
||||
self.textboxTranscript.setPlainText("")
|
||||
self.show()
|
||||
|
||||
# Threaded signal passing worker functions
|
||||
# Prepare env for capturing from microphone and offload work to micWorker worker thread
|
||||
if (not self.openMicrophone.isChecked()):
|
||||
self.openMicrophone.setStyleSheet('QPushButton {background-color: #C60000; color: black;}')
|
||||
self.openMicrophone.setText("Stop")
|
||||
logging.debug("Start Recording pressed")
|
||||
logging.debug("Preparing for transcription...")
|
||||
|
||||
sctx = self.model[0].createStream()
|
||||
subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
|
||||
stdout=subprocess.PIPE,
|
||||
bufsize=0)
|
||||
self.textboxTranscript.insertPlainText('You can start speaking now\n\n')
|
||||
self.show()
|
||||
logging.debug('You can start speaking now')
|
||||
context = (sctx, subproc, self.model[0])
|
||||
|
||||
# Pass the state to streaming worker
|
||||
worker = Worker(self.micWorker, context)
|
||||
worker.signals.progress.connect(self.progress)
|
||||
worker.signals.result.connect(self.transcription)
|
||||
worker.signals.finished.connect(self.micFinish)
|
||||
|
||||
# Execute
|
||||
self.threadpool.start(worker)
|
||||
else:
|
||||
logging.debug("Stop Recording")
|
||||
|
||||
'''
|
||||
Capture the audio stream from the microphone.
|
||||
The context is prepared by the openMicrophone_on_click()
|
||||
@param context: A tuple containing three objects
|
||||
1. DeepSpeech streaming context, sctx
|
||||
2. subprocess handle
|
||||
3. Deepspeech model object
|
||||
'''
|
||||
def micWorker(self, context, progress_callback):
|
||||
# Deepspeech Streaming will be run from this method
|
||||
logging.debug("Recording from your microphone")
|
||||
while (not self.openMicrophone.isChecked()):
|
||||
data = context[1].stdout.read(512)
|
||||
context[2].feedAudioContent(context[0], np.frombuffer(data, np.int16))
|
||||
else:
|
||||
transcript = context[2].finishStream(context[0])
|
||||
context[1].terminate()
|
||||
context[1].wait()
|
||||
self.show()
|
||||
progress_callback.emit(transcript)
|
||||
return "\n*********************\nTranscription Done..."
|
||||
|
||||
def micFinish(self):
|
||||
self.openMicrophone.setText("Start Speaking")
|
||||
self.openMicrophone.setStyleSheet('QPushButton {background-color: #70cc7c; color: black;}')
|
||||
|
||||
def transcription(self, out):
|
||||
logging.debug("%s" % out)
|
||||
self.textboxTranscript.insertPlainText(out)
|
||||
self.show()
|
||||
|
||||
def wavFinish(self):
|
||||
logging.debug("File processed")
|
||||
|
||||
def progress(self, chunk):
|
||||
logging.debug("Progress: %s" % chunk)
|
||||
self.textboxTranscript.insertPlainText(chunk)
|
||||
self.show()
|
||||
|
||||
def wavWorker(self, waveFile, progress_callback):
|
||||
# Deepspeech will be run from this method
|
||||
logging.debug("Preparing for transcription...")
|
||||
inference_time = 0.0
|
||||
|
||||
# Run VAD on the input file
|
||||
segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, 1)
|
||||
f = open(os.path.splitext(waveFile)[0] + ".txt", 'w')
|
||||
logging.debug("Saving Transcript @: %s" % waveFile.rstrip(".wav") + ".txt")
|
||||
|
||||
for i, segment in enumerate(segments):
|
||||
# Run deepspeech on the chunk that just completed VAD
|
||||
logging.debug("Processing chunk %002d" % (i,))
|
||||
audio = np.frombuffer(segment, dtype=np.int16)
|
||||
output = wavTranscriber.stt(self.model[0], audio, sample_rate)
|
||||
inference_time += output[1]
|
||||
|
||||
f.write(output[0] + " ")
|
||||
progress_callback.emit(output[0] + " ")
|
||||
|
||||
# Summary of the files processed
|
||||
f.close()
|
||||
|
||||
# Format pretty, extract filename from the full file path
|
||||
filename, ext = os.path.splitext(os.path.basename(waveFile))
|
||||
title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'LM Load Time(s)']
|
||||
logging.debug("************************************************************************************************************")
|
||||
logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
|
||||
logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, self.model[1], self.model[2]))
|
||||
logging.debug("************************************************************************************************************")
|
||||
print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
|
||||
print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, self.model[1], self.model[2]))
|
||||
|
||||
return "\n*********************\nTranscription Done..."
|
||||
|
||||
|
||||
def main(args):
|
||||
app = QApplication(sys.argv)
|
||||
w = App()
|
||||
sys.exit(app.exec_())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv[1:])
|
|
@ -0,0 +1,3 @@
|
|||
deepspeech==0.6.0
|
||||
webrtcvad
|
||||
pyqt5
|
|
@ -0,0 +1,23 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -xe
|
||||
|
||||
THIS=$(dirname "$0")
|
||||
|
||||
pushd ${THIS}
|
||||
source ../tests.sh
|
||||
|
||||
pip install --user $(get_python_wheel_url "$1")
|
||||
pip install --user -r requirements.txt
|
||||
|
||||
python audioTranscript_cmd.py \
|
||||
--audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \
|
||||
--aggressive 0 \
|
||||
--model $HOME/DeepSpeech/models/
|
||||
|
||||
python audioTranscript_cmd.py \
|
||||
--audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \
|
||||
--aggressive 0 \
|
||||
--model $HOME/DeepSpeech/models/ \
|
||||
--stream
|
||||
popd
|
|
@ -0,0 +1,134 @@
|
|||
import collections
|
||||
import contextlib
|
||||
import wave
|
||||
|
||||
|
||||
def read_wave(path):
|
||||
"""Reads a .wav file.
|
||||
|
||||
Takes the path, and returns (PCM audio data, sample rate, duration).
|
||||
"""
|
||||
with contextlib.closing(wave.open(path, 'rb')) as wf:
|
||||
num_channels = wf.getnchannels()
|
||||
assert num_channels == 1
|
||||
sample_width = wf.getsampwidth()
|
||||
assert sample_width == 2
|
||||
sample_rate = wf.getframerate()
|
||||
assert sample_rate in (8000, 16000, 32000)
|
||||
frames = wf.getnframes()
|
||||
pcm_data = wf.readframes(frames)
|
||||
duration = frames / sample_rate
|
||||
return pcm_data, sample_rate, duration
|
||||
|
||||
|
||||
def write_wave(path, audio, sample_rate):
|
||||
"""Writes a .wav file.
|
||||
|
||||
Takes path, PCM audio data, and sample rate.
|
||||
"""
|
||||
with contextlib.closing(wave.open(path, 'wb')) as wf:
|
||||
wf.setnchannels(1)
|
||||
wf.setsampwidth(2)
|
||||
wf.setframerate(sample_rate)
|
||||
wf.writeframes(audio)
|
||||
|
||||
|
||||
class Frame(object):
|
||||
"""Represents a "frame" of audio data."""
|
||||
def __init__(self, bytes, timestamp, duration):
|
||||
self.bytes = bytes
|
||||
self.timestamp = timestamp
|
||||
self.duration = duration
|
||||
|
||||
|
||||
def frame_generator(frame_duration_ms, audio, sample_rate):
|
||||
"""Generates audio frames from PCM audio data.
|
||||
|
||||
Takes the desired frame duration in milliseconds, the PCM data, and
|
||||
the sample rate.
|
||||
|
||||
Yields Frames of the requested duration.
|
||||
"""
|
||||
n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
|
||||
offset = 0
|
||||
timestamp = 0.0
|
||||
duration = (float(n) / sample_rate) / 2.0
|
||||
while offset + n < len(audio):
|
||||
yield Frame(audio[offset:offset + n], timestamp, duration)
|
||||
timestamp += duration
|
||||
offset += n
|
||||
|
||||
|
||||
def vad_collector(sample_rate, frame_duration_ms,
|
||||
padding_duration_ms, vad, frames):
|
||||
"""Filters out non-voiced audio frames.
|
||||
|
||||
Given a webrtcvad.Vad and a source of audio frames, yields only
|
||||
the voiced audio.
|
||||
|
||||
Uses a padded, sliding window algorithm over the audio frames.
|
||||
When more than 90% of the frames in the window are voiced (as
|
||||
reported by the VAD), the collector triggers and begins yielding
|
||||
audio frames. Then the collector waits until 90% of the frames in
|
||||
the window are unvoiced to detrigger.
|
||||
|
||||
The window is padded at the front and back to provide a small
|
||||
amount of silence or the beginnings/endings of speech around the
|
||||
voiced frames.
|
||||
|
||||
Arguments:
|
||||
|
||||
sample_rate - The audio sample rate, in Hz.
|
||||
frame_duration_ms - The frame duration in milliseconds.
|
||||
padding_duration_ms - The amount to pad the window, in milliseconds.
|
||||
vad - An instance of webrtcvad.Vad.
|
||||
frames - a source of audio frames (sequence or generator).
|
||||
|
||||
Returns: A generator that yields PCM audio data.
|
||||
"""
|
||||
num_padding_frames = int(padding_duration_ms / frame_duration_ms)
|
||||
# We use a deque for our sliding window/ring buffer.
|
||||
ring_buffer = collections.deque(maxlen=num_padding_frames)
|
||||
# We have two states: TRIGGERED and NOTTRIGGERED. We start in the
|
||||
# NOTTRIGGERED state.
|
||||
triggered = False
|
||||
|
||||
voiced_frames = []
|
||||
for frame in frames:
|
||||
is_speech = vad.is_speech(frame.bytes, sample_rate)
|
||||
|
||||
if not triggered:
|
||||
ring_buffer.append((frame, is_speech))
|
||||
num_voiced = len([f for f, speech in ring_buffer if speech])
|
||||
# If we're NOTTRIGGERED and more than 90% of the frames in
|
||||
# the ring buffer are voiced frames, then enter the
|
||||
# TRIGGERED state.
|
||||
if num_voiced > 0.9 * ring_buffer.maxlen:
|
||||
triggered = True
|
||||
# We want to yield all the audio we see from now until
|
||||
# we are NOTTRIGGERED, but we have to start with the
|
||||
# audio that's already in the ring buffer.
|
||||
for f, s in ring_buffer:
|
||||
voiced_frames.append(f)
|
||||
ring_buffer.clear()
|
||||
else:
|
||||
# We're in the TRIGGERED state, so collect the audio data
|
||||
# and add it to the ring buffer.
|
||||
voiced_frames.append(frame)
|
||||
ring_buffer.append((frame, is_speech))
|
||||
num_unvoiced = len([f for f, speech in ring_buffer if not speech])
|
||||
# If more than 90% of the frames in the ring buffer are
|
||||
# unvoiced, then enter NOTTRIGGERED and yield whatever
|
||||
# audio we've collected.
|
||||
if num_unvoiced > 0.9 * ring_buffer.maxlen:
|
||||
triggered = False
|
||||
yield b''.join([f.bytes for f in voiced_frames])
|
||||
ring_buffer.clear()
|
||||
voiced_frames = []
|
||||
if triggered:
|
||||
pass
|
||||
# If we have any leftover voiced audio when we run out of input,
|
||||
# yield it.
|
||||
if voiced_frames:
|
||||
yield b''.join([f.bytes for f in voiced_frames])
|
||||
|
|
@ -0,0 +1,97 @@
|
|||
import glob
|
||||
import webrtcvad
|
||||
import logging
|
||||
import wavSplit
|
||||
from deepspeech import Model
|
||||
from timeit import default_timer as timer
|
||||
|
||||
'''
|
||||
Load the pre-trained model into the memory
|
||||
@param models: Output Graph Protocol Buffer file
|
||||
@param lm: Language model file
|
||||
@param trie: Trie file
|
||||
|
||||
@Retval
|
||||
Returns a list [DeepSpeech Object, Model Load Time, LM Load Time]
|
||||
'''
|
||||
def load_model(models, lm, trie):
|
||||
BEAM_WIDTH = 500
|
||||
LM_ALPHA = 0.75
|
||||
LM_BETA = 1.85
|
||||
|
||||
model_load_start = timer()
|
||||
ds = Model(models, BEAM_WIDTH)
|
||||
model_load_end = timer() - model_load_start
|
||||
logging.debug("Loaded model in %0.3fs." % (model_load_end))
|
||||
|
||||
lm_load_start = timer()
|
||||
ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
|
||||
lm_load_end = timer() - lm_load_start
|
||||
logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))
|
||||
|
||||
return [ds, model_load_end, lm_load_end]
|
||||
|
||||
'''
|
||||
Run Inference on input audio file
|
||||
@param ds: Deepspeech object
|
||||
@param audio: Input audio for running inference on
|
||||
@param fs: Sample rate of the input audio file
|
||||
|
||||
@Retval:
|
||||
Returns a list [Inference, Inference Time, Audio Length]
|
||||
|
||||
'''
|
||||
def stt(ds, audio, fs):
|
||||
inference_time = 0.0
|
||||
audio_length = len(audio) * (1 / fs)
|
||||
|
||||
# Run Deepspeech
|
||||
logging.debug('Running inference...')
|
||||
inference_start = timer()
|
||||
output = ds.stt(audio)
|
||||
inference_end = timer() - inference_start
|
||||
inference_time += inference_end
|
||||
logging.debug('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length))
|
||||
|
||||
return [output, inference_time]
|
||||
|
||||
'''
|
||||
Resolve directory path for the models and fetch each of them.
|
||||
@param dirName: Path to the directory containing pre-trained models
|
||||
|
||||
@Retval:
|
||||
Returns a tuple containing each of the model files (pb, lm and trie)
|
||||
'''
|
||||
def resolve_models(dirName):
|
||||
pb = glob.glob(dirName + "/*.pb")[0]
|
||||
logging.debug("Found Model: %s" % pb)
|
||||
|
||||
lm = glob.glob(dirName + "/lm.binary")[0]
|
||||
trie = glob.glob(dirName + "/trie")[0]
|
||||
logging.debug("Found Language Model: %s" % lm)
|
||||
logging.debug("Found Trie: %s" % trie)
|
||||
|
||||
return pb, lm, trie
|
||||
|
||||
'''
|
||||
Generate VAD segments. Filters out non-voiced audio frames.
|
||||
@param wavFile: Input wav file to run VAD on.
|
||||
|
||||
@Retval:
|
||||
Returns tuple of
|
||||
segments: a bytearray of multiple smaller audio frames
|
||||
(the longer audio split into multiple smaller ones)
|
||||
sample_rate: Sample rate of the input audio file
|
||||
audio_length: Duration of the input audio file
|
||||
|
||||
'''
|
||||
def vad_segment_generator(wavFile, aggressiveness):
|
||||
logging.debug("Caught the wav file @: %s" % (wavFile))
|
||||
audio, sample_rate, audio_length = wavSplit.read_wave(wavFile)
|
||||
assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
|
||||
vad = webrtcvad.Vad(int(aggressiveness))
|
||||
frames = wavSplit.frame_generator(30, audio, sample_rate)
|
||||
frames = list(frames)
|
||||
segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames)
|
||||
|
||||
return segments, sample_rate, audio_length
|
|
@ -0,0 +1,107 @@
|
|||
## Transcribing longer audio clips
|
||||
|
||||
The Command and GUI tools perform transcription on long wav files.
|
||||
They take in a wav file of any duration, use the WebRTC Voice Activity Detector (VAD)
|
||||
to split it into smaller chunks and finally save a consolidated transcript.
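
For orientation, here is a minimal sketch of the pipeline both tools run internally. It assumes the `wavTranscriber` module from this directory is importable and that the model directory and wav file used below (placeholders only) exist on your machine:

```
import numpy as np

import wavTranscriber

# Placeholders -- point these at your own model directory and audio file.
model_dir = './models/0.4.1/'
wav_file = './audio/sample_rec.wav'

# Locate and load output_graph, lm and trie from the model directory.
output_graph, lm, trie = wavTranscriber.resolve_models(model_dir)
model, model_load_time, lm_load_time = wavTranscriber.load_model(output_graph, lm, trie)

# Split the file into voiced segments with the WebRTC VAD (aggressiveness 1).
segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(wav_file, 1)

# Run DeepSpeech on each voiced segment and stitch the pieces together.
transcript = []
for segment in segments:
    audio = np.frombuffer(segment, dtype=np.int16)
    text, inference_time = wavTranscriber.stt(model, audio, sample_rate)
    transcript.append(text)

print(' '.join(transcript))
```

The command line and GUI tools described below wrap this flow with argument parsing, logging and saving of the transcript next to the input wav file.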
|
||||
|
||||
### 0. Prerequisites
|
||||
#### 0.1 Install required packages
|
||||
Install the package that provides the `rec` command on the machine:
|
||||
|
||||
Fedora:
|
||||
|
||||
``` sudo dnf install sox ```
|
||||
|
||||
Tested on: Fedora 29
|
||||
|
||||
Ubuntu/Debian
|
||||
|
||||
``` sudo apt install sox ```
|
||||
|
||||
A list of distributions where the package is available can be found at: https://pkgs.org/download/sox
|
||||
|
||||
#### 0.2 Download Deepspeech
|
||||
Either clone the repository via git clone, or download a release from the releases page.
|
||||
|
||||
For the next steps we assume you have extracted the files to ~/Deepspeech
|
||||
|
||||
|
||||
#### 0.3 Setup your environment
|
||||
|
||||
Ubuntu/Debian:
|
||||
|
||||
```
|
||||
~/Deepspeech$ sudo apt install virtualenv
|
||||
~/Deepspeech$ cd examples/vad_transcriber
|
||||
~/Deepspeech/examples/vad_transcriber$ virtualenv -p python3 venv
|
||||
~/Deepspeech/examples/vad_transcriber$ source venv/bin/activate
|
||||
(venv) ~/Deepspeech/examples/vad_transcriber$ pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
Fedora
|
||||
|
||||
```
|
||||
~/Deepspeech$ sudo dnf install python-virtualenv
|
||||
~/Deepspeech$ cd examples/vad_transcriber
|
||||
~/Deepspeech/examples/vad_transcriber$ virtualenv -p python3 venv
|
||||
~/Deepspeech/examples/vad_transcriber$ source venv/bin/activate
|
||||
(venv) ~/Deepspeech/examples/vad_transcriber$ pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
Tested on: Fedora 29
|
||||
|
||||
### 1. Command line tool
|
||||
|
||||
The command line tool processes a wav file of any duration and produces a transcript,
|
||||
which will be saved in the same directory as the input audio file.
|
||||
|
||||
The command line tool gives you control over the aggressiveness of the VAD.
|
||||
Set the aggressiveness mode to an integer between 0 and 3:
|
||||
0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.
|
||||
|
||||
```
|
||||
(venv) ~/Deepspeech/examples/vad_transcriber
|
||||
$ python3 audioTranscript_cmd.py --aggressive 1 --audio ./audio/guido-van-rossum.wav --model ./models/0.4.1/
|
||||
|
||||
|
||||
Filename Duration(s) Inference Time(s) Model Load Time(s) LM Load Time(s)
|
||||
sample_rec.wav 13.710 20.797 5.593 17.742
|
||||
|
||||
```
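
For reference, the `--aggressive` value is passed straight through to `webrtcvad.Vad`. The short sketch below (illustration only, assuming `webrtcvad` from `requirements.txt` is installed) shows what the two extremes mean for a single 30 ms frame:

```
import webrtcvad

sample_rate = 16000
frame = b'\x00\x00' * int(sample_rate * 0.03)  # one 30 ms frame of silence, 16-bit mono PCM

vad_relaxed = webrtcvad.Vad(0)  # least aggressive: borderline audio is kept as speech
vad_strict = webrtcvad.Vad(3)   # most aggressive: only clear speech passes

print(vad_relaxed.is_speech(frame, sample_rate))  # typically False for pure silence
print(vad_strict.is_speech(frame, sample_rate))   # False
```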
|
||||
|
||||
**Note:** Only `wav` files with a 16kHz sample rate are supported for now. You can convert your files to the appropriate format with ffmpeg, if it is available on your system:
|
||||
|
||||
```
ffmpeg -i infile.mp3 -ar 16000 -ac 1 outfile.wav
```
|
||||
|
||||
### 2. Minimalistic GUI
|
||||
|
||||
The GUI tool does the same job as the CLI tool. The VAD is fixed at an aggressiveness of 1.
|
||||
The output is displayed in the transcription window and is saved into the same directory as the input
|
||||
audio file as well.
|
||||
|
||||
```
|
||||
(venv) ~/Deepspeech/examples/vad_transcriber
|
||||
$ python3 audioTranscript_gui.py
|
||||
|
||||
```
|
||||
|
||||
![Deepspeech Transcriber](../../doc/audioTranscript.png)
|
||||
|
||||
|
||||
#### 2.1. Sporadic failures in pyqt
|
||||
Some systems have encountered the **_Cannot mix incompatible Qt library with this library_** issue.
|
||||
In such a scenario, the GUI tool will not work. The following steps are known to have solved the issue in most cases:
|
||||
```
|
||||
(venv) ~/Deepspeech/examples/vad_transcriber$ pip3 uninstall pyqt5
|
||||
(venv) ~/Deepspeech/examples/vad_transcriber$ sudo apt install python3-pyqt5 canberra-gtk-module
|
||||
(venv) ~/Deepspeech/examples/vad_transcriber$ export PYTHONPATH=/usr/lib/python3/dist-packages/
|
||||
(venv) ~/Deepspeech/examples/vad_transcriber$ python3 audioTranscript_gui.py
|
||||
|
||||
```
|
||||
#### 2.2 Useful Tips
|
||||
##### The GUI program immediately crashes when you press start recording
|
||||
This happens when you don't load the models via the "Browse Models" button before pressing the "Start recording" button.
|
||||
|
||||
##### What does error XYZ mean?
|
||||
You can find a list of error codes and what they mean at https://deepspeech.readthedocs.io/en/latest/Error-Codes.html
|
||||
|