This commit is contained in:
Reuben Morais 2019-12-04 16:38:56 +01:00
Parent 1cd56b53de
Commit 80cafe6bfb
36 changed files with 3016 additions and 0 deletions

@@ -0,0 +1,62 @@
# FFmpeg VAD Streaming
Streaming inference from arbitrary source (FFmpeg input) to DeepSpeech, using VAD (voice activity detection). A fairly simple example demonstrating the DeepSpeech streaming API in Node.js.
This example was successfully tested with a mobile phone streaming a live feed to an RTMP server (nginx-rtmp), which this script could then consume for near-real-time speech recognition.
## Installation
```bash
npm install
```
FFmpeg must also be installed:
```bash
sudo apt-get install ffmpeg
```
## Usage
Here is an example for a local audio file:
```bash
node ./index.js --audio <AUDIO_FILE> \
--model $HOME/models/output_graph.pbmm
```
Here is an example for a remote RTMP stream:
```bash
node ./index.js --audio rtmp://<IP>:1935/live/teststream \
--model $HOME/models/output_graph.pbmm
```
## Examples
Real-time streaming inference with DeepSpeech's example audio ([audio-0.4.1.tar.gz](https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz)).
```bash
node ./index.js --audio $HOME/audio/2830-3980-0043.wav \
--lm $HOME/models/lm.binary \
--trie $HOME/models/trie \
--model $HOME/models/output_graph.pbmm
```
```bash
node ./index.js --audio $HOME/audio/4507-16021-0012.wav \
--lm $HOME/models/lm.binary \
--trie $HOME/models/trie \
--model $HOME/models/output_graph.pbmm
```
```bash
node ./index.js --audio $HOME/audio/8455-210777-0068.wav \
--lm $HOME/models/lm.binary \
--trie $HOME/models/trie \
--model $HOME/models/output_graph.pbmm
```
Real-time streaming inference in combination with an RTMP server.
```bash
node ./index.js --audio rtmp://<HOST>/<APP>/<KEY> \
--lm $HOME/models/lm.binary \
--trie $HOME/models/trie \
--model $HOME/models/output_graph.pbmm
```
## Notes
To get the best results for your own scenario, it might be helpful to adjust the `VAD_MODE` and `DEBOUNCE_TIME` parameters.

@@ -0,0 +1,123 @@
#!/usr/bin/env node
const VAD = require("node-vad");
const Ds = require('deepspeech');
const argparse = require('argparse');
const util = require('util');
const { spawn } = require('child_process');
// These constants control the beam search decoder
// Beam width used in the CTC decoder when building candidate transcriptions
const BEAM_WIDTH = 500;
// The alpha hyperparameter of the CTC decoder. Language Model weight
const LM_ALPHA = 0.75;
// The beta hyperparameter of the CTC decoder. Word insertion bonus.
const LM_BETA = 1.85;
let VersionAction = function VersionAction(options) {
options = options || {};
options.nargs = 0;
argparse.Action.call(this, options);
};
util.inherits(VersionAction, argparse.Action);
VersionAction.prototype.call = function(parser) {
Ds.printVersions();
process.exit(0);
};
let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio source to run (ffmpeg supported formats)'});
parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exit'});
let args = parser.parseArgs();
function totalTime(hrtimeValue) {
return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
}
console.error('Loading model from file %s', args['model']);
const model_load_start = process.hrtime();
let model = new Ds.Model(args['model'], BEAM_WIDTH);
const model_load_end = process.hrtime(model_load_start);
console.error('Loaded model in %ds.', totalTime(model_load_end));
if (args['lm'] && args['trie']) {
console.error('Loading language model from files %s %s', args['lm'], args['trie']);
const lm_load_start = process.hrtime();
model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA);
const lm_load_end = process.hrtime(lm_load_start);
console.error('Loaded language model in %ds.', totalTime(lm_load_end));
}
// Default is 16kHz
const AUDIO_SAMPLE_RATE = 16000;
// Defines different thresholds for voice detection
// NORMAL: Suitable for high bitrate, low-noise data. May classify noise as voice, too.
// LOW_BITRATE: Detection mode optimised for low-bitrate audio.
// AGGRESSIVE: Detection mode best suited for somewhat noisy, lower quality audio.
// VERY_AGGRESSIVE: Detection mode with lowest miss-rate. Works well for most inputs.
const VAD_MODE = VAD.Mode.NORMAL;
// const VAD_MODE = VAD.Mode.LOW_BITRATE;
// const VAD_MODE = VAD.Mode.AGGRESSIVE;
// const VAD_MODE = VAD.Mode.VERY_AGGRESSIVE;
// Time in milliseconds for debouncing speech active state
const DEBOUNCE_TIME = 20;
// Create voice activity stream
const VAD_STREAM = VAD.createStream({
mode: VAD_MODE,
audioFrequency: AUDIO_SAMPLE_RATE,
debounceTime: DEBOUNCE_TIME
});
// Spawn ffmpeg process
const ffmpeg = spawn('ffmpeg', [
'-hide_banner',
'-nostats',
'-loglevel', 'fatal',
'-i', args['audio'],
'-vn',
'-acodec', 'pcm_s16le',
'-ac', 1,
'-ar', AUDIO_SAMPLE_RATE,
'-f', 's16le',
'pipe:'
]);
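// Seconds of audio fed to the decoder so far, plus the initial DeepSpeech streaming context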
let audioLength = 0;
let sctx = model.createStream();
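// Finish the current stream, print its transcription, and report how long inference
// took relative to the amount of audio that was fed in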
function finishStream() {
const model_load_start = process.hrtime();
console.error('Running inference.');
console.log('Transcription: ', model.finishStream(sctx));
const model_load_end = process.hrtime(model_load_start);
console.error('Inference took %ds for %ds audio file.', totalTime(model_load_end), audioLength.toPrecision(4));
audioLength = 0;
}
function intermediateDecode() {
finishStream();
sctx = model.createStream();
}
function feedAudioContent(chunk) {
audioLength += (chunk.length / 2) * ( 1 / AUDIO_SAMPLE_RATE);
model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
}
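// node-vad emits speech start/active/end events: keep feeding audio while speech is
// active, and finalize the stream (printing a transcription) once the segment ends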
function processVad(data) {
  if (data.speech.start || data.speech.state) feedAudioContent(data.audioData);
  else if (data.speech.end) { feedAudioContent(data.audioData); intermediateDecode(); }
}
ffmpeg.stdout.pipe(VAD_STREAM).on('data', processVad);

@@ -0,0 +1,16 @@
{
"name": "ffmpeg-vad-streaming",
"version": "1.0.0",
"description": "Streaming inference from arbitrary source with VAD and FFmpeg",
"main": "index.js",
"scripts": {
"start": "node ./index.js"
},
"dependencies": {
"argparse": "^1.0.10",
"deepspeech": "0.6.0",
"node-vad": "^1.1.1",
"util": "^0.11.1"
},
"license" : "MIT"
}

ffmpeg_vad_streaming/test.sh (executable file, 27 lines)
@@ -0,0 +1,27 @@
#!/bin/bash
set -xe
THIS=$(dirname "$0")
pushd ${THIS}
source ../tests.sh
npm install $(get_npm_package_url)
npm install
node ./index.js --audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \
--lm $HOME/DeepSpeech/models/lm.binary \
--trie $HOME/DeepSpeech/models/trie \
--model $HOME/DeepSpeech/models/output_graph.pbmm
node ./index.js --audio $HOME/DeepSpeech/audio/4507-16021-0012.wav \
--lm $HOME/DeepSpeech/models/lm.binary \
--trie $HOME/DeepSpeech/models/trie \
--model $HOME/DeepSpeech/models/output_graph.pbmm
node ./index.js --audio $HOME/DeepSpeech/audio/8455-210777-0068.wav \
--lm $HOME/DeepSpeech/models/lm.binary \
--trie $HOME/DeepSpeech/models/trie \
--model $HOME/DeepSpeech/models/output_graph.pbmm
popd

@@ -0,0 +1,69 @@
Microphone VAD Streaming
========================

Stream from microphone to DeepSpeech, using VAD (voice activity detection). A fairly simple example demonstrating the DeepSpeech streaming API in Python. Also useful for quick, real-time testing of models and decoding parameters.

Installation
------------

.. code-block:: bash

    pip install -r requirements.txt

Uses portaudio for microphone access, so on Linux you may need to install its header files to compile the ``pyaudio`` package:

.. code-block:: bash

    sudo apt install portaudio19-dev

Installation on macOS may fail due to portaudio; use Homebrew to install it:

.. code-block:: bash

    brew install portaudio

Usage
-----

.. code-block::

    usage: mic_vad_streaming.py [-h] [-v VAD_AGGRESSIVENESS] [--nospinner]
                                [-w SAVEWAV] -m MODEL [-l LM]
                                [-t TRIE] [-nf N_FEATURES] [-nc N_CONTEXT]
                                [-la LM_ALPHA] [-lb LM_BETA]
                                [-bw BEAM_WIDTH]

    Stream from microphone to DeepSpeech using VAD

    optional arguments:
      -h, --help            show this help message and exit
      -v VAD_AGGRESSIVENESS, --vad_aggressiveness VAD_AGGRESSIVENESS
                            Set aggressiveness of VAD: an integer between 0 and 3,
                            0 being the least aggressive about filtering out non-
                            speech, 3 the most aggressive. Default: 3
      --nospinner           Disable spinner
      -w SAVEWAV, --savewav SAVEWAV
                            Save .wav files of utterances to given directory
      -m MODEL, --model MODEL
                            Path to the model (protocol buffer binary file, or
                            entire directory containing all standard-named files
                            for model)
      -l LM, --lm LM        Path to the language model binary file. Default:
                            lm.binary
      -t TRIE, --trie TRIE  Path to the language model trie file created with
                            native_client/generate_trie. Default: trie
      -nf N_FEATURES, --n_features N_FEATURES
                            Number of MFCC features to use. Default: 26
      -nc N_CONTEXT, --n_context N_CONTEXT
                            Size of the context window used for producing
                            timesteps in the input vector. Default: 9
      -la LM_ALPHA, --lm_alpha LM_ALPHA
                            The alpha hyperparameter of the CTC decoder. Language
                            Model weight. Default: 0.75
      -lb LM_BETA, --lm_beta LM_BETA
                            The beta hyperparameter of the CTC decoder. Word insertion
                            bonus. Default: 1.85
      -bw BEAM_WIDTH, --beam_width BEAM_WIDTH
                            Beam width used in the CTC decoder when building
                            candidate transcriptions. Default: 500
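
Under the hood the script drives the DeepSpeech streaming API. For reference, a minimal sketch of that flow on a prerecorded file (the ``output_graph.pbmm`` and ``audio.wav`` paths are illustrative placeholders, not part of this example):

.. code-block:: python

    import wave
    import numpy as np
    import deepspeech

    # Load the acoustic model (DeepSpeech 0.6 API); 500 is the beam width.
    model = deepspeech.Model('output_graph.pbmm', 500)

    # Read a 16 kHz, 16-bit mono WAV file and feed it through a stream.
    with wave.open('audio.wav', 'rb') as wav:
        audio = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)

    stream = model.createStream()
    model.feedAudioContent(stream, audio)
    print(model.finishStream(stream))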

@@ -0,0 +1,237 @@
import time, logging
from datetime import datetime
import threading, collections, queue, os, os.path
import deepspeech
import numpy as np
import pyaudio
import wave
import webrtcvad
from halo import Halo
from scipy import signal
logging.basicConfig(level=20)
class Audio(object):
"""Streams raw audio from microphone. Data is received in a separate thread, and stored in a buffer, to be read from."""
FORMAT = pyaudio.paInt16
# Network/VAD rate-space
RATE_PROCESS = 16000
CHANNELS = 1
BLOCKS_PER_SECOND = 50
def __init__(self, callback=None, device=None, input_rate=RATE_PROCESS, file=None):
def proxy_callback(in_data, frame_count, time_info, status):
#pylint: disable=unused-argument
if self.chunk is not None:
in_data = self.wf.readframes(self.chunk)
callback(in_data)
return (None, pyaudio.paContinue)
if callback is None: callback = lambda in_data: self.buffer_queue.put(in_data)
self.buffer_queue = queue.Queue()
self.device = device
self.input_rate = input_rate
self.sample_rate = self.RATE_PROCESS
self.block_size = int(self.RATE_PROCESS / float(self.BLOCKS_PER_SECOND))
self.block_size_input = int(self.input_rate / float(self.BLOCKS_PER_SECOND))
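        # BLOCKS_PER_SECOND = 50 gives 20 ms blocks (320 samples at 16 kHz),
        # one of the frame durations webrtcvad accepts.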
self.pa = pyaudio.PyAudio()
kwargs = {
'format': self.FORMAT,
'channels': self.CHANNELS,
'rate': self.input_rate,
'input': True,
'frames_per_buffer': self.block_size_input,
'stream_callback': proxy_callback,
}
self.chunk = None
# if not default device
if self.device:
kwargs['input_device_index'] = self.device
elif file is not None:
self.chunk = 320
self.wf = wave.open(file, 'rb')
self.stream = self.pa.open(**kwargs)
self.stream.start_stream()
def resample(self, data, input_rate):
"""
Microphone may not support our native processing sampling rate, so
resample from input_rate to RATE_PROCESS here for webrtcvad and
deepspeech
Args:
data (binary): Input audio stream
input_rate (int): Input audio rate to resample from
"""
        data16 = np.frombuffer(data, dtype=np.int16)
resample_size = int(len(data16) / self.input_rate * self.RATE_PROCESS)
resample = signal.resample(data16, resample_size)
resample16 = np.array(resample, dtype=np.int16)
        return resample16.tobytes()
def read_resampled(self):
"""Return a block of audio data resampled to 16000hz, blocking if necessary."""
return self.resample(data=self.buffer_queue.get(),
input_rate=self.input_rate)
def read(self):
"""Return a block of audio data, blocking if necessary."""
return self.buffer_queue.get()
def destroy(self):
self.stream.stop_stream()
self.stream.close()
self.pa.terminate()
frame_duration_ms = property(lambda self: 1000 * self.block_size // self.sample_rate)
def write_wav(self, filename, data):
logging.info("write wav %s", filename)
wf = wave.open(filename, 'wb')
wf.setnchannels(self.CHANNELS)
# wf.setsampwidth(self.pa.get_sample_size(FORMAT))
assert self.FORMAT == pyaudio.paInt16
wf.setsampwidth(2)
wf.setframerate(self.sample_rate)
wf.writeframes(data)
wf.close()
class VADAudio(Audio):
"""Filter & segment audio with voice activity detection."""
def __init__(self, aggressiveness=3, device=None, input_rate=None, file=None):
super().__init__(device=device, input_rate=input_rate, file=file)
self.vad = webrtcvad.Vad(aggressiveness)
def frame_generator(self):
"""Generator that yields all audio frames from microphone."""
if self.input_rate == self.RATE_PROCESS:
while True:
yield self.read()
else:
while True:
yield self.read_resampled()
def vad_collector(self, padding_ms=300, ratio=0.75, frames=None):
"""Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None.
Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered.
Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
|---utterence---| |---utterence---|
"""
if frames is None: frames = self.frame_generator()
num_padding_frames = padding_ms // self.frame_duration_ms
ring_buffer = collections.deque(maxlen=num_padding_frames)
triggered = False
for frame in frames:
if len(frame) < 640:
return
is_speech = self.vad.is_speech(frame, self.sample_rate)
if not triggered:
ring_buffer.append((frame, is_speech))
num_voiced = len([f for f, speech in ring_buffer if speech])
if num_voiced > ratio * ring_buffer.maxlen:
triggered = True
for f, s in ring_buffer:
yield f
ring_buffer.clear()
else:
yield frame
ring_buffer.append((frame, is_speech))
num_unvoiced = len([f for f, speech in ring_buffer if not speech])
if num_unvoiced > ratio * ring_buffer.maxlen:
triggered = False
yield None
ring_buffer.clear()
def main(ARGS):
# Load DeepSpeech model
if os.path.isdir(ARGS.model):
model_dir = ARGS.model
ARGS.model = os.path.join(model_dir, 'output_graph.pb')
ARGS.lm = os.path.join(model_dir, ARGS.lm)
ARGS.trie = os.path.join(model_dir, ARGS.trie)
print('Initializing model...')
logging.info("ARGS.model: %s", ARGS.model)
model = deepspeech.Model(ARGS.model, ARGS.beam_width)
if ARGS.lm and ARGS.trie:
logging.info("ARGS.lm: %s", ARGS.lm)
logging.info("ARGS.trie: %s", ARGS.trie)
model.enableDecoderWithLM(ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta)
# Start audio with VAD
vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
device=ARGS.device,
input_rate=ARGS.rate,
file=ARGS.file)
print("Listening (ctrl-C to exit)...")
frames = vad_audio.vad_collector()
# Stream from microphone to DeepSpeech using VAD
spinner = None
if not ARGS.nospinner:
spinner = Halo(spinner='line')
stream_context = model.createStream()
wav_data = bytearray()
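    # vad_collector() yields audio frames while speech is detected, plus a single
    # None to mark the end of each utterance.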
for frame in frames:
if frame is not None:
if spinner: spinner.start()
logging.debug("streaming frame")
model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
if ARGS.savewav: wav_data.extend(frame)
else:
if spinner: spinner.stop()
logging.debug("end utterence")
if ARGS.savewav:
vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
wav_data = bytearray()
text = model.finishStream(stream_context)
print("Recognized: %s" % text)
stream_context = model.createStream()
if __name__ == '__main__':
BEAM_WIDTH = 500
DEFAULT_SAMPLE_RATE = 16000
LM_ALPHA = 0.75
LM_BETA = 1.85
import argparse
parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
parser.add_argument('-v', '--vad_aggressiveness', type=int, default=3,
help="Set aggressiveness of VAD: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
parser.add_argument('--nospinner', action='store_true',
help="Disable spinner")
parser.add_argument('-w', '--savewav',
help="Save .wav files of utterences to given directory")
parser.add_argument('-f', '--file',
help="Read from .wav file instead of microphone")
parser.add_argument('-m', '--model', required=True,
help="Path to the model (protocol buffer binary file, or entire directory containing all standard-named files for model)")
parser.add_argument('-l', '--lm', default='lm.binary',
help="Path to the language model binary file. Default: lm.binary")
parser.add_argument('-t', '--trie', default='trie',
help="Path to the language model trie file created with native_client/generate_trie. Default: trie")
parser.add_argument('-d', '--device', type=int, default=None,
help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().")
parser.add_argument('-r', '--rate', type=int, default=DEFAULT_SAMPLE_RATE,
help=f"Input device sample rate. Default: {DEFAULT_SAMPLE_RATE}. Your device may require 44100.")
parser.add_argument('-la', '--lm_alpha', type=float, default=LM_ALPHA,
help=f"The alpha hyperparameter of the CTC decoder. Language Model weight. Default: {LM_ALPHA}")
parser.add_argument('-lb', '--lm_beta', type=float, default=LM_BETA,
help=f"The beta hyperparameter of the CTC decoder. Word insertion bonus. Default: {LM_BETA}")
parser.add_argument('-bw', '--beam_width', type=int, default=BEAM_WIDTH,
help=f"Beam width used in the CTC decoder when building candidate transcriptions. Default: {BEAM_WIDTH}")
ARGS = parser.parse_args()
if ARGS.savewav: os.makedirs(ARGS.savewav, exist_ok=True)
main(ARGS)

@@ -0,0 +1,6 @@
deepspeech==0.6.0
pyaudio~=0.2.11
webrtcvad~=2.0.10
halo~=0.0.18
numpy>=1.15.1
scipy>=1.1.0

mic_vad_streaming/test.sh (executable file, 20 lines)
@@ -0,0 +1,20 @@
#!/bin/bash
set -xe
THIS=$(dirname "$0")
pushd ${THIS}
source ../tests.sh
pip install --user $(get_python_wheel_url "$1")
pip install --user -r requirements.txt
pulseaudio &
python mic_vad_streaming.py \
--model $HOME/DeepSpeech/models/output_graph.pbmm \
--lm $HOME/DeepSpeech/models/lm.binary \
--trie $HOME/DeepSpeech/models/trie \
--file $HOME/DeepSpeech/audio/2830-3980-0043.wav
popd

net_framework/.gitignore (vendored file, 330 lines)
@@ -0,0 +1,330 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush
.cr/
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.6.2" />
</startup>
</configuration>

@@ -0,0 +1,8 @@
<Application
x:Class="DeepSpeechWPF.App"
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
xmlns:local="clr-namespace:DeepSpeechWPF"
StartupUri="MainWindow.xaml">
<Application.Resources />
</Application>

@@ -0,0 +1,44 @@
using CommonServiceLocator;
using DeepSpeech.WPF.ViewModels;
using DeepSpeechClient.Interfaces;
using GalaSoft.MvvmLight.Ioc;
using System.Windows;
namespace DeepSpeechWPF
{
/// <summary>
/// Interaction logic for App.xaml
/// </summary>
public partial class App : Application
{
protected override void OnStartup(StartupEventArgs e)
{
base.OnStartup(e);
ServiceLocator.SetLocatorProvider(() => SimpleIoc.Default);
const int BEAM_WIDTH = 500;
//Register instance of DeepSpeech
DeepSpeechClient.DeepSpeech deepSpeechClient = new DeepSpeechClient.DeepSpeech();
try
{
deepSpeechClient.CreateModel("output_graph.pbmm", BEAM_WIDTH);
}
catch (System.Exception ex)
{
MessageBox.Show(ex.Message);
Current.Shutdown();
}
SimpleIoc.Default.Register<IDeepSpeech>(() => deepSpeechClient);
SimpleIoc.Default.Register<MainWindowViewModel>();
}
protected override void OnExit(ExitEventArgs e)
{
base.OnExit(e);
//Dispose instance of DeepSpeech
ServiceLocator.Current.GetInstance<IDeepSpeech>()?.Dispose();
}
}
}

@@ -0,0 +1,140 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{54BFD766-4305-4F4C-BA59-AF45505DF3C1}</ProjectGuid>
<OutputType>WinExe</OutputType>
<RootNamespace>DeepSpeech.WPF</RootNamespace>
<AssemblyName>DeepSpeech.WPF</AssemblyName>
<TargetFrameworkVersion>v4.6.2</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<ProjectTypeGuids>{60dc8134-eba5-43b8-bcc9-bb4bc16c2548};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
<WarningLevel>4</WarningLevel>
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
<Deterministic>true</Deterministic>
<NuGetPackageImportStamp>
</NuGetPackageImportStamp>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x64\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>AnyCPU</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>false</Prefer32Bit>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x64'">
<OutputPath>bin\x64\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>
<ItemGroup>
<Reference Include="AsyncAwaitBestPractices, Version=1.0.0.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>packages\AsyncAwaitBestPractices.3.1.0\lib\netstandard1.0\AsyncAwaitBestPractices.dll</HintPath>
</Reference>
<Reference Include="AsyncAwaitBestPractices.MVVM, Version=1.0.0.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>packages\AsyncAwaitBestPractices.MVVM.3.1.0\lib\netstandard1.0\AsyncAwaitBestPractices.MVVM.dll</HintPath>
</Reference>
<Reference Include="CommonServiceLocator, Version=2.0.2.0, Culture=neutral, PublicKeyToken=489b6accfaf20ef0, processorArchitecture=MSIL">
<HintPath>packages\CommonServiceLocator.2.0.2\lib\net45\CommonServiceLocator.dll</HintPath>
</Reference>
<Reference Include="CSCore, Version=1.2.1.2, Culture=neutral, PublicKeyToken=5a08f2b6f4415dea, processorArchitecture=MSIL">
<HintPath>packages\CSCore.1.2.1.2\lib\net35-client\CSCore.dll</HintPath>
</Reference>
<Reference Include="GalaSoft.MvvmLight, Version=5.4.1.0, Culture=neutral, PublicKeyToken=e7570ab207bcb616, processorArchitecture=MSIL">
<HintPath>packages\MvvmLightLibs.5.4.1.1\lib\net45\GalaSoft.MvvmLight.dll</HintPath>
</Reference>
<Reference Include="GalaSoft.MvvmLight.Extras, Version=5.4.1.0, Culture=neutral, PublicKeyToken=669f0b5e8f868abf, processorArchitecture=MSIL">
<HintPath>packages\MvvmLightLibs.5.4.1.1\lib\net45\GalaSoft.MvvmLight.Extras.dll</HintPath>
</Reference>
<Reference Include="GalaSoft.MvvmLight.Platform, Version=5.4.1.0, Culture=neutral, PublicKeyToken=5f873c45e98af8a1, processorArchitecture=MSIL">
<HintPath>packages\MvvmLightLibs.5.4.1.1\lib\net45\GalaSoft.MvvmLight.Platform.dll</HintPath>
</Reference>
<Reference Include="NAudio, Version=1.9.0.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>packages\NAudio.1.9.0\lib\net35\NAudio.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.Data" />
<Reference Include="System.Windows.Forms" />
<Reference Include="System.Windows.Interactivity, Version=4.5.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>packages\MvvmLightLibs.5.4.1.1\lib\net45\System.Windows.Interactivity.dll</HintPath>
</Reference>
<Reference Include="System.Xml" />
<Reference Include="Microsoft.CSharp" />
<Reference Include="System.Core" />
<Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="System.Net.Http" />
<Reference Include="System.Xaml">
<RequiredTargetFramework>4.0</RequiredTargetFramework>
</Reference>
<Reference Include="WindowsBase" />
<Reference Include="PresentationCore" />
<Reference Include="PresentationFramework" />
</ItemGroup>
<ItemGroup>
<ApplicationDefinition Include="App.xaml">
<Generator>MSBuild:Compile</Generator>
<SubType>Designer</SubType>
</ApplicationDefinition>
<Compile Include="ViewModels\MainWindowViewModel.cs" />
<Page Include="MainWindow.xaml">
<Generator>MSBuild:Compile</Generator>
<SubType>Designer</SubType>
</Page>
<Compile Include="App.xaml.cs">
<DependentUpon>App.xaml</DependentUpon>
<SubType>Code</SubType>
</Compile>
<Compile Include="ViewModels\BindableBase.cs" />
<Compile Include="MainWindow.xaml.cs">
<DependentUpon>MainWindow.xaml</DependentUpon>
<SubType>Code</SubType>
</Compile>
</ItemGroup>
<ItemGroup>
<Compile Include="Properties\AssemblyInfo.cs">
<SubType>Code</SubType>
</Compile>
<Compile Include="Properties\Resources.Designer.cs">
<AutoGen>True</AutoGen>
<DesignTime>True</DesignTime>
<DependentUpon>Resources.resx</DependentUpon>
</Compile>
<Compile Include="Properties\Settings.Designer.cs">
<AutoGen>True</AutoGen>
<DependentUpon>Settings.settings</DependentUpon>
<DesignTimeSharedInput>True</DesignTimeSharedInput>
</Compile>
<EmbeddedResource Include="Properties\Resources.resx">
<Generator>ResXFileCodeGenerator</Generator>
<LastGenOutput>Resources.Designer.cs</LastGenOutput>
</EmbeddedResource>
<None Include="packages.config" />
<None Include="Properties\Settings.settings">
<Generator>SettingsSingleFileGenerator</Generator>
<LastGenOutput>Settings.Designer.cs</LastGenOutput>
</None>
</ItemGroup>
<ItemGroup>
<None Include="App.config" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\..\native_client\dotnet\DeepSpeechClient\DeepSpeechClient.csproj">
<Project>{56de4091-bbbe-47e4-852d-7268b33b971f}</Project>
<Name>DeepSpeechClient</Name>
</ProjectReference>
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
</Project>

@@ -0,0 +1,31 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.421
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeech.WPF", "DeepSpeech.WPF.csproj", "{54BFD766-4305-4F4C-BA59-AF45505DF3C1}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechClient", "..\..\..\native_client\dotnet\DeepSpeechClient\DeepSpeechClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{54BFD766-4305-4F4C-BA59-AF45505DF3C1}.Debug|x64.ActiveCfg = Debug|x64
{54BFD766-4305-4F4C-BA59-AF45505DF3C1}.Debug|x64.Build.0 = Debug|x64
{54BFD766-4305-4F4C-BA59-AF45505DF3C1}.Release|x64.ActiveCfg = Release|x64
{54BFD766-4305-4F4C-BA59-AF45505DF3C1}.Release|x64.Build.0 = Release|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.ActiveCfg = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.Build.0 = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.ActiveCfg = Release|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {19C58802-CCEC-4FD1-8D17-A6EB766116F7}
EndGlobalSection
EndGlobal

@@ -0,0 +1,102 @@
<Window
x:Class="DeepSpeechWPF.MainWindow"
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
Title="Deepspeech client"
Width="800"
Height="600"
Loaded="Window_Loaded"
WindowStartupLocation="CenterScreen"
mc:Ignorable="d">
<Grid>
<Grid.RowDefinitions>
<RowDefinition Height="222" />
<RowDefinition />
</Grid.RowDefinitions>
<TextBox
Grid.Row="1"
Margin="10,36,10,10"
FontSize="16px"
Text="{Binding Transcription, Mode=OneWay}"
TextWrapping="Wrap" />
<Label
Grid.Row="1"
Height="26"
Margin="10,5,10,0"
VerticalAlignment="Top"
Content="Results:" />
<Label
Height="26"
Margin="10,10,10,0"
VerticalAlignment="Top"
Content="Select an audio file to transcript:" />
<TextBox
Height="23"
Margin="10,41,10,0"
VerticalAlignment="Top"
Text="{Binding AudioFilePath, Mode=TwoWay}"
TextWrapping="Wrap" />
<Button
Width="80"
Height="25"
Margin="10,69,0,0"
HorizontalAlignment="Left"
VerticalAlignment="Top"
Command="{Binding SelectFileCommand}"
Content="Open file" />
<Button
Width="82"
Height="25"
Margin="95,69,0,0"
HorizontalAlignment="Left"
VerticalAlignment="Top"
Command="{Binding EnableLanguageModelCommand}"
Content="Enable LM" />
<Button
Width="75"
Height="25"
Margin="182,69,0,0"
HorizontalAlignment="Left"
VerticalAlignment="Top"
Command="{Binding InferenceFromFileCommand}"
Content="Transcript" />
<Label
Height="30"
Margin="10,99,10,0"
VerticalAlignment="Top"
Content="{Binding StatusMessage, Mode=OneWay}" />
<Label
Height="26"
Margin="10,158,10,0"
VerticalAlignment="Top"
Content="Select an audio input:" />
<ComboBox
Height="23"
Margin="20,189,186,0"
VerticalAlignment="Top"
DisplayMemberPath="FriendlyName"
ItemsSource="{Binding AvailableRecordDevices, Mode=TwoWay}"
SelectedIndex="0"
SelectedItem="{Binding SelectedDevice, Mode=TwoWay}" />
<Button
Width="91"
Height="23"
Margin="0,0,90,10"
HorizontalAlignment="Right"
VerticalAlignment="Bottom"
Command="{Binding StartRecordingCommand}"
Content="Record"
IsEnabled="{Binding EnableStartRecord, Mode=OneWay}" />
<Button
Width="75"
Height="23"
Margin="0,0,10,10"
HorizontalAlignment="Right"
VerticalAlignment="Bottom"
Command="{Binding StopRecordingCommand}"
Content="Stop"
IsEnabled="{Binding EnableStopRecord, Mode=OneWay}" />
</Grid>
</Window>

@@ -0,0 +1,17 @@
using CommonServiceLocator;
using DeepSpeech.WPF.ViewModels;
using System.Windows;
namespace DeepSpeechWPF
{
/// <summary>
/// Interaction logic for MainWindow.xaml
/// </summary>
public partial class MainWindow : Window
{
public MainWindow() => InitializeComponent();
private void Window_Loaded(object sender, RoutedEventArgs e) =>
DataContext = ServiceLocator.Current.GetInstance<MainWindowViewModel>();
}
}

@@ -0,0 +1,55 @@
using System.Reflection;
using System.Resources;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Windows;
// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyTitle("DeepSpeech.WPF")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("DeepSpeech.WPF.SingleFiles")]
[assembly: AssemblyCopyright("Copyright © 2018")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]
// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]
//In order to begin building localizable applications, set
//<UICulture>CultureYouAreCodingWith</UICulture> in your .csproj file
//inside a <PropertyGroup>. For example, if you are using US english
//in your source files, set the <UICulture> to en-US. Then uncomment
//the NeutralResourceLanguage attribute below. Update the "en-US" in
//the line below to match the UICulture setting in the project file.
//[assembly: NeutralResourcesLanguage("en-US", UltimateResourceFallbackLocation.Satellite)]
[assembly: ThemeInfo(
ResourceDictionaryLocation.None, //where theme specific resource dictionaries are located
//(used if a resource is not found in the page,
// or application resource dictionaries)
ResourceDictionaryLocation.SourceAssembly //where the generic resource dictionary is located
//(used if a resource is not found in the page,
// app, or any theme specific resource dictionaries)
)]
// Version information for an assembly consists of the following four values:
//
// Major Version
// Minor Version
// Build Number
// Revision
//
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]

net_framework/DeepSpeechWPF/Properties/Resources.Designer.cs (generated file, 63 lines)
@@ -0,0 +1,63 @@
//------------------------------------------------------------------------------
// <auto-generated>
// This code was generated by a tool.
// Runtime Version:4.0.30319.42000
//
// Changes to this file may cause incorrect behavior and will be lost if
// the code is regenerated.
// </auto-generated>
//------------------------------------------------------------------------------
namespace DeepSpeech.WPF.Properties {
using System;
/// <summary>
/// A strongly-typed resource class, for looking up localized strings, etc.
/// </summary>
// This class was auto-generated by the StronglyTypedResourceBuilder
// class via a tool like ResGen or Visual Studio.
// To add or remove a member, edit your .ResX file then rerun ResGen
// with the /str option, or rebuild your VS project.
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "15.0.0.0")]
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
[global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
internal class Resources {
private static global::System.Resources.ResourceManager resourceMan;
private static global::System.Globalization.CultureInfo resourceCulture;
[global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")]
internal Resources() {
}
/// <summary>
/// Returns the cached ResourceManager instance used by this class.
/// </summary>
[global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
internal static global::System.Resources.ResourceManager ResourceManager {
get {
if (object.ReferenceEquals(resourceMan, null)) {
global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("DeepSpeech.WPF.Properties.Resources", typeof(Resources).Assembly);
resourceMan = temp;
}
return resourceMan;
}
}
/// <summary>
/// Overrides the current thread's CurrentUICulture property for all
/// resource lookups using this strongly typed resource class.
/// </summary>
[global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
internal static global::System.Globalization.CultureInfo Culture {
get {
return resourceCulture;
}
set {
resourceCulture = value;
}
}
}
}

@@ -0,0 +1,117 @@
<?xml version="1.0" encoding="utf-8"?>
<root>
<!--
Microsoft ResX Schema
Version 2.0
The primary goals of this format is to allow a simple XML format
that is mostly human readable. The generation and parsing of the
various data types are done through the TypeConverter classes
associated with the data types.
Example:
... ado.net/XML headers & schema ...
<resheader name="resmimetype">text/microsoft-resx</resheader>
<resheader name="version">2.0</resheader>
<resheader name="reader">System.Resources.ResXResourceReader, System.Windows.Forms, ...</resheader>
<resheader name="writer">System.Resources.ResXResourceWriter, System.Windows.Forms, ...</resheader>
<data name="Name1"><value>this is my long string</value><comment>this is a comment</comment></data>
<data name="Color1" type="System.Drawing.Color, System.Drawing">Blue</data>
<data name="Bitmap1" mimetype="application/x-microsoft.net.object.binary.base64">
<value>[base64 mime encoded serialized .NET Framework object]</value>
</data>
<data name="Icon1" type="System.Drawing.Icon, System.Drawing" mimetype="application/x-microsoft.net.object.bytearray.base64">
<value>[base64 mime encoded string representing a byte array form of the .NET Framework object]</value>
<comment>This is a comment</comment>
</data>
There are any number of "resheader" rows that contain simple
name/value pairs.
Each data row contains a name, and value. The row also contains a
type or mimetype. Type corresponds to a .NET class that support
text/value conversion through the TypeConverter architecture.
Classes that don't support this are serialized and stored with the
mimetype set.
The mimetype is used for serialized objects, and tells the
ResXResourceReader how to depersist the object. This is currently not
extensible. For a given mimetype the value must be set accordingly:
Note - application/x-microsoft.net.object.binary.base64 is the format
that the ResXResourceWriter will generate, however the reader can
read any of the formats listed below.
mimetype: application/x-microsoft.net.object.binary.base64
value : The object must be serialized with
: System.Serialization.Formatters.Binary.BinaryFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.soap.base64
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Soap.SoapFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.bytearray.base64
value : The object must be serialized into a byte array
: using a System.ComponentModel.TypeConverter
: and then encoded with base64 encoding.
-->
<xsd:schema id="root" xmlns="" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata">
<xsd:element name="root" msdata:IsDataSet="true">
<xsd:complexType>
<xsd:choice maxOccurs="unbounded">
<xsd:element name="metadata">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" />
<xsd:attribute name="type" type="xsd:string" />
<xsd:attribute name="mimetype" type="xsd:string" />
</xsd:complexType>
</xsd:element>
<xsd:element name="assembly">
<xsd:complexType>
<xsd:attribute name="alias" type="xsd:string" />
<xsd:attribute name="name" type="xsd:string" />
</xsd:complexType>
</xsd:element>
<xsd:element name="data">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
<xsd:element name="comment" type="xsd:string" minOccurs="0" msdata:Ordinal="2" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" msdata:Ordinal="1" />
<xsd:attribute name="type" type="xsd:string" msdata:Ordinal="3" />
<xsd:attribute name="mimetype" type="xsd:string" msdata:Ordinal="4" />
</xsd:complexType>
</xsd:element>
<xsd:element name="resheader">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" use="required" />
</xsd:complexType>
</xsd:element>
</xsd:choice>
</xsd:complexType>
</xsd:element>
</xsd:schema>
<resheader name="resmimetype">
<value>text/microsoft-resx</value>
</resheader>
<resheader name="version">
<value>2.0</value>
</resheader>
<resheader name="reader">
<value>System.Resources.ResXResourceReader, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</resheader>
<resheader name="writer">
<value>System.Resources.ResXResourceWriter, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</resheader>
</root>

net_framework/DeepSpeechWPF/Properties/Settings.Designer.cs (generated file, 26 lines)
@@ -0,0 +1,26 @@
//------------------------------------------------------------------------------
// <auto-generated>
// This code was generated by a tool.
// Runtime Version:4.0.30319.42000
//
// Changes to this file may cause incorrect behavior and will be lost if
// the code is regenerated.
// </auto-generated>
//------------------------------------------------------------------------------
namespace DeepSpeech.WPF.Properties {
[global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "15.9.0.0")]
internal sealed partial class Settings : global::System.Configuration.ApplicationSettingsBase {
private static Settings defaultInstance = ((Settings)(global::System.Configuration.ApplicationSettingsBase.Synchronized(new Settings())));
public static Settings Default {
get {
return defaultInstance;
}
}
}
}

@@ -0,0 +1,7 @@
<?xml version='1.0' encoding='utf-8'?>
<SettingsFile xmlns="uri:settings" CurrentProfile="(Default)">
<Profiles>
<Profile Name="(Default)" />
</Profiles>
<Settings />
</SettingsFile>

@@ -0,0 +1,49 @@
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Runtime.CompilerServices;
namespace DeepSpeech.WPF.ViewModels
{
/// <summary>
/// Implementation of <see cref="INotifyPropertyChanged"/> to simplify models.
/// </summary>
public abstract class BindableBase : INotifyPropertyChanged
{
/// <summary>
/// Checks if a property already matches a desired value. Sets the property and
/// notifies listeners only when necessary.
/// </summary>
/// <typeparam name="T">Type of the property.</typeparam>
/// <param name="storage">Reference to a property with both getter and setter.</param>
/// <param name="value">Desired value for the property.</param>
/// <param name="propertyName">Name of the property used to notify listeners. This
/// value is optional and can be provided automatically when invoked from compilers that
/// support CallerMemberName.</param>
/// <returns>True if the value was changed, false if the existing value matched the
/// desired value.</returns>
protected bool SetProperty<T>(ref T backingStore, T value,
[CallerMemberName]string propertyName = "",
Action onChanged = null)
{
if (EqualityComparer<T>.Default.Equals(backingStore, value))
return false;
backingStore = value;
onChanged?.Invoke();
OnPropertyChanged(propertyName);
return true;
}
#region INotifyPropertyChanged
/// <summary>
/// Notifies listeners that a property value has changed.
/// </summary>
/// <param name="propertyName">Name of the property used to notify listeners. This
/// value is optional and can be provided automatically when invoked from compilers
/// that support <see cref="CallerMemberNameAttribute"/>.</param>
public event PropertyChangedEventHandler PropertyChanged;
protected void OnPropertyChanged([CallerMemberName] string propertyName = "")
=> PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(propertyName));
#endregion
}
}

@@ -0,0 +1,422 @@
using AsyncAwaitBestPractices.MVVM;
using CSCore;
using CSCore.CoreAudioAPI;
using CSCore.SoundIn;
using CSCore.Streams;
using DeepSpeechClient.Interfaces;
using GalaSoft.MvvmLight.CommandWpf;
using Microsoft.Win32;
using System;
using System.Collections.Concurrent;
using System.Collections.ObjectModel;
using System.Diagnostics;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
namespace DeepSpeech.WPF.ViewModels
{
/// <summary>
/// View model of the MainWindow View.
/// </summary>
public class MainWindowViewModel : BindableBase
{
#region Constants
private const int SampleRate = 16000;
private const string LMPath = "lm.binary";
private const string TriePath = "trie";
#endregion
private readonly IDeepSpeech _sttClient;
#region Commands
/// <summary>
/// Gets or sets the command that enables the language model.
/// </summary>
public IAsyncCommand EnableLanguageModelCommand { get; private set; }
/// <summary>
/// Gets or sets the command that runs inference using an audio file.
/// </summary>
public IAsyncCommand InferenceFromFileCommand { get; private set; }
/// <summary>
/// Gets or sets the command that opens a dialog to select an audio file.
/// </summary>
public RelayCommand SelectFileCommand { get; private set; }
/// <summary>
/// Gets or sets the command that starts to record.
/// </summary>
public RelayCommand StartRecordingCommand { get; private set; }
/// <summary>
/// Gets or sets the command that stops the recording and gets the result.
/// </summary>
public IAsyncCommand StopRecordingCommand { get; private set; }
#endregion
#region Streaming
/// <summary>
/// Records the audio of the selected device.
/// </summary>
private WasapiCapture _audioCapture;
/// <summary>
/// Converts the device source into a wavesource.
/// </summary>
private SoundInSource _soundInSource;
/// <summary>
        /// Target wave source (16 kHz, mono, 16-bit, as required by DeepSpeech).
/// </summary>
private IWaveSource _convertedSource;
/// <summary>
/// Queue that prevents feeding data to the inference engine if it is busy.
/// </summary>
private ConcurrentQueue<short[]> _bufferQueue = new ConcurrentQueue<short[]>();
private int _threadSafeBoolBackValue = 0;
/// <summary>
        /// Lock to process items in the queue one at a time.
/// </summary>
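        // Interlocked operations make this flag safe to read and write from the
        // audio-capture callback, the queue-draining tasks, and the UI thread.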
public bool StreamingIsBusy
{
get => (Interlocked.CompareExchange(ref _threadSafeBoolBackValue, 1, 1) == 1);
set
{
if (value) Interlocked.CompareExchange(ref _threadSafeBoolBackValue, 1, 0);
else Interlocked.CompareExchange(ref _threadSafeBoolBackValue, 0, 1);
}
}
#endregion
#region ViewProperties
private bool _enableStartRecord;
/// <summary>
/// Gets or sets record status to control the record command.
/// </summary>
public bool EnableStartRecord
{
get => _enableStartRecord;
set => SetProperty(ref _enableStartRecord, value);
}
        private bool _enableStopRecord;
        /// <summary>
        /// Gets or sets record status to control the stop command.
        /// </summary>
        public bool EnableStopRecord
        {
            get => _enableStopRecord;
            set => SetProperty(ref _enableStopRecord, value,
                onChanged: () => ((AsyncCommand)StopRecordingCommand).RaiseCanExecuteChanged());
}
private MMDevice _selectedDevice;
/// <summary>
/// Gets or sets the selected recording device.
/// </summary>
public MMDevice SelectedDevice
{
get => _selectedDevice;
set => SetProperty(ref _selectedDevice, value,
onChanged: UpdateSelectedDevice);
}
private string _statusMessage;
/// <summary>
/// Gets or sets status message.
/// </summary>
public string StatusMessage
{
get => _statusMessage;
set => SetProperty(ref _statusMessage, value);
}
private bool _languageModelEnabled;
/// <summary>
/// Gets or sets the language model status.
/// </summary>
private bool LanguageModelEnabled
{
get => _languageModelEnabled;
set => SetProperty(ref _languageModelEnabled, value,
onChanged: () => ((AsyncCommand)EnableLanguageModelCommand).RaiseCanExecuteChanged());
}
private bool _isRunningInference;
/// <summary>
        /// Gets or sets whether the model is running inference.
/// </summary>
private bool IsRunningInference
{
get => _isRunningInference;
set => SetProperty(ref _isRunningInference, value,
onChanged: () => ((AsyncCommand)InferenceFromFileCommand).RaiseCanExecuteChanged());
}
private string _transcription;
/// <summary>
/// Gets or sets the current transcription.
/// </summary>
public string Transcription
{
get => _transcription;
set => SetProperty(ref _transcription, value);
}
        private string _audioFilePath;
        /// <summary>
        /// Gets or sets the selected audio file path.
        /// </summary>
        public string AudioFilePath
        {
            get => _audioFilePath;
            set => SetProperty(ref _audioFilePath, value);
}
private ObservableCollection<MMDevice> _deviceNames;
/// <summary>
/// Gets or sets the available recording devices.
/// </summary>
public ObservableCollection<MMDevice> AvailableRecordDevices
{
get => _deviceNames;
set => SetProperty(ref _deviceNames, value);
}
#endregion
#region Ctors
public MainWindowViewModel(IDeepSpeech sttClient)
{
_sttClient = sttClient;
EnableLanguageModelCommand = new AsyncCommand(()=>EnableLanguageModelAsync(LMPath,TriePath),
_ => !LanguageModelEnabled);
InferenceFromFileCommand = new AsyncCommand(ExecuteInferenceFromFileAsync,
_ => !IsRunningInference);
SelectFileCommand = new RelayCommand(SelectAudioFile);
StartRecordingCommand = new RelayCommand(StartRecording,
canExecute: CanExecuteStartRecording);
StopRecordingCommand = new AsyncCommand(StopRecordingAsync,
_ => EnableStopRecord);
LoadAvailableCaptureDevices();
}
#endregion
/// <summary>
/// Releases the current capture device and initializes the selected one.
/// </summary>
private void UpdateSelectedDevice()
{
ReleaseCapture();
InitializeAudioCapture();
}
/// <summary>
/// Releases the capture device.
/// </summary>
private void ReleaseCapture()
{
if (_audioCapture != null)
{
_audioCapture.DataAvailable -= Capture_DataAvailable;
_audioCapture.Dispose();
}
}
/// <summary>
        /// Used by the start-recording command to determine whether recording can start.
        /// </summary>
        /// <returns>True if a recording device is selected.</returns>
private bool CanExecuteStartRecording() =>
SelectedDevice != null;
/// <summary>
/// Loads all the available audio capture devices.
/// </summary>
private void LoadAvailableCaptureDevices()
{
AvailableRecordDevices = new ObservableCollection<MMDevice>(
MMDeviceEnumerator.EnumerateDevices(DataFlow.All, DeviceState.Active)); //we get only enabled devices
EnableStartRecord = true;
if (AvailableRecordDevices?.Count != 0)
SelectedDevice = AvailableRecordDevices[0];
}
/// <summary>
/// Initializes the capture source.
/// </summary>
private void InitializeAudioCapture()
{
if (SelectedDevice != null)
{
_audioCapture = SelectedDevice.DataFlow == DataFlow.Capture ?
new WasapiCapture() : new WasapiLoopbackCapture();
_audioCapture.Device = SelectedDevice;
_audioCapture.Initialize();
_audioCapture.DataAvailable += Capture_DataAvailable;
_soundInSource = new SoundInSource(_audioCapture) { FillWithZeros = false };
                //create a source that converts the data provided by the
                //soundInSource to the required format
_convertedSource = _soundInSource
.ChangeSampleRate(SampleRate) // sample rate
.ToSampleSource()
.ToWaveSource(16); //bits per sample
_convertedSource = _convertedSource.ToMono();
}
}
private void Capture_DataAvailable(object sender, DataAvailableEventArgs e)
{
            //read data from the convertedSource
            //important: don't use e.Data here; it contains the raw data provided by the
            //soundInSource, which is not in the audio format DeepSpeech requires
byte[] buffer = new byte[_convertedSource.WaveFormat.BytesPerSecond / 2];
int read;
//keep reading as long as we still get some data
while ((read = _convertedSource.Read(buffer, 0, buffer.Length)) > 0)
{
short[] sdata = new short[(int)Math.Ceiling(Convert.ToDecimal(read / 2))];
Buffer.BlockCopy(buffer, 0, sdata, 0, read);
_bufferQueue.Enqueue(sdata);
Task.Run(() => OnNewData());
}
}
/// <summary>
/// Starts processing data from the queue.
/// </summary>
private void OnNewData()
{
while (!StreamingIsBusy && !_bufferQueue.IsEmpty)
{
if (_bufferQueue.TryDequeue(out short[] buffer))
{
StreamingIsBusy = true;
_sttClient.FeedAudioContent(buffer, Convert.ToUInt32(buffer.Length));
StreamingIsBusy = false;
}
}
}
/// <summary>
/// Enables the language model.
/// </summary>
/// <param name="lmPath">Language model path.</param>
/// <param name="triePath">Trie path.</param>
/// <returns>A Task to await.</returns>
public async Task EnableLanguageModelAsync(string lmPath, string triePath)
{
try
{
StatusMessage = "Loading language model...";
const float LM_ALPHA = 0.75f;
const float LM_BETA = 1.85f;
await Task.Run(() => _sttClient.EnableDecoderWithLM(lmPath, triePath, LM_ALPHA, LM_BETA));
LanguageModelEnabled = true;
StatusMessage = "Language model loaded.";
}
catch (Exception ex)
{
StatusMessage = ex.Message;
}
}
/// <summary>
/// Runs inference and sets the transcription of an audio file.
/// </summary>
/// <returns>A Task to await.</returns>
public async Task ExecuteInferenceFromFileAsync()
{
try
{
IsRunningInference = true;
Transcription = string.Empty;
StatusMessage = "Running inference...";
Stopwatch watch = new Stopwatch();
var waveBuffer = new NAudio.Wave.WaveBuffer(File.ReadAllBytes(AudioFilePath));
using (var waveInfo = new NAudio.Wave.WaveFileReader(AudioFilePath))
{
watch.Start();
string speechResult = await Task.Run(() => _sttClient.SpeechToText(
waveBuffer.ShortBuffer,
Convert.ToUInt32(waveBuffer.MaxSize / 2)));
watch.Stop();
Transcription = $"Audio duration: {waveInfo.TotalTime.ToString()} {Environment.NewLine}" +
$"Inference took: {watch.Elapsed.ToString()} {Environment.NewLine}" +
$"Recognized text: {speechResult}";
}
waveBuffer.Clear();
StatusMessage = string.Empty;
}
catch (Exception ex)
{
StatusMessage = ex.Message;
}
finally
{
IsRunningInference = false;
}
}
/// <summary>
/// Stops the recording and sets the transcription of the closed stream.
/// </summary>
/// <returns>A Task to await.</returns>
private async Task StopRecordingAsync()
{
EnableStopRecord = false;
_audioCapture.Stop();
while (!_bufferQueue.IsEmpty && StreamingIsBusy) //we wait for all the queued buffers to be processed
{
await Task.Delay(90);
}
Transcription = _sttClient.FinishStream();
EnableStartRecord = true;
}
/// <summary>
/// Creates a new stream and starts the recording.
/// </summary>
private void StartRecording()
{
_sttClient.CreateStream();
_audioCapture.Start();
EnableStartRecord = false;
EnableStopRecord = true;
}
/// <summary>
/// Opens a dialog to select an audio file.
/// </summary>
private void SelectAudioFile()
{
OpenFileDialog dialog = new OpenFileDialog
{
Filter = "wav Files |*.wav",
Multiselect = false,
Title = "Please select a wav file."
};
if (dialog.ShowDialog() == true)
{
AudioFilePath = dialog.FileName;
}
}
}
}

View file

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="AsyncAwaitBestPractices" version="3.1.0" targetFramework="net462" />
<package id="AsyncAwaitBestPractices.MVVM" version="3.1.0" targetFramework="net462" />
<package id="CommonServiceLocator" version="2.0.2" targetFramework="net462" />
<package id="CSCore" version="1.2.1.2" targetFramework="net462" />
<package id="MvvmLightLibs" version="5.4.1.1" targetFramework="net462" />
<package id="NAudio" version="1.9.0" targetFramework="net462" />
</packages>

58
nodejs_wav/Readme.md Normal file
View file

@ -0,0 +1,58 @@
# NodeJS voice recognition example using Mozilla DeepSpeech
Download the pre-trained model (1.8GB):
```
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.6.0/deepspeech-0.6.0-models.tar.gz
tar xvfz deepspeech-0.6.0-models.tar.gz
```
Edit references to models path if necessary:
```
let modelPath = './models/output_graph.pbmm';
let lmPath = './models/lm.binary';
let triePath = './models/trie';
```
Install Sox (for .wav file loading):
```
brew install sox
```
Download test audio files:
```
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz
tar xfvz audio-0.4.1.tar.gz
```
Install NPM dependencies:
```
npm install
```
Run:
```
node index.js
```
Result should be something like:
```
audio length 1.975
result: experience proves this
```
Try other wav files with an argument:
```
node index.js audio/2830-3980-0043.wav
node index.js audio/8455-210777-0068.wav
node index.js audio/4507-16021-0012.wav
```

70
nodejs_wav/index.js Normal file
View file

@ -0,0 +1,70 @@
const DeepSpeech = require('deepspeech');
const Fs = require('fs');
const Sox = require('sox-stream');
const MemoryStream = require('memory-stream');
const Duplex = require('stream').Duplex;
const Wav = require('node-wav');
const BEAM_WIDTH = 1024;
let modelPath = './models/output_graph.pbmm';
let model = new DeepSpeech.Model(modelPath, BEAM_WIDTH);
let desiredSampleRate = model.sampleRate();
const LM_ALPHA = 0.75;
const LM_BETA = 1.85;
let lmPath = './models/lm.binary';
let triePath = './models/trie';
model.enableDecoderWithLM(lmPath, triePath, LM_ALPHA, LM_BETA);
let audioFile = process.argv[2] || './audio/2830-3980-0043.wav';
if (!Fs.existsSync(audioFile)) {
console.log('file missing:', audioFile);
process.exit();
}
const buffer = Fs.readFileSync(audioFile);
const result = Wav.decode(buffer);
if (result.sampleRate < desiredSampleRate) {
console.error('Warning: original sample rate (' + result.sampleRate + ') is lower than ' + desiredSampleRate + 'Hz. Up-sampling might produce erratic speech recognition.');
}
function bufferToStream(buffer) {
let stream = new Duplex();
stream.push(buffer);
stream.push(null);
return stream;
}
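// Pipe the wav buffer through SoX to convert it to raw, 16-bit, signed, mono PCM at the model's sample rate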
let audioStream = new MemoryStream();
bufferToStream(buffer).
pipe(Sox({
global: {
'no-dither': true,
},
output: {
bits: 16,
rate: desiredSampleRate,
channels: 1,
encoding: 'signed-integer',
endian: 'little',
compression: 0.0,
type: 'raw'
}
})).
pipe(audioStream);
audioStream.on('finish', () => {
let audioBuffer = audioStream.toBuffer();
const audioLength = (audioBuffer.length / 2) * (1 / desiredSampleRate);
console.log('audio length', audioLength);
let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2));
console.log('result:', result);
});

17
nodejs_wav/package.json Normal file
View file

@ -0,0 +1,17 @@
{
"name": "deepspeech-nodejs_wav",
"version": "1.0.0",
"description": "Simple audio processing",
"main": "index.js",
"scripts": {
"start": "node ./index.js"
},
"dependencies": {
"argparse": "^1.0.10",
"deepspeech": "0.6.0",
"node-wav": "0.0.2",
"sox-stream": "^2.0.3",
"util": "^0.11.1"
},
"license": "Public domain"
}

18
nodejs_wav/test.sh Executable file
View file

@ -0,0 +1,18 @@
#!/bin/bash
set -xe
THIS=$(dirname "$0")
pushd ${THIS}
source ../tests.sh
npm install $(get_npm_package_url)
npm install
ln -s $HOME/DeepSpeech/models models
node index.js $HOME/DeepSpeech/audio/2830-3980-0043.wav
node index.js $HOME/DeepSpeech/audio/8455-210777-0068.wav
node index.js $HOME/DeepSpeech/audio/4507-16021-0012.wav
popd

23
tests.sh Executable file
View file

@ -0,0 +1,23 @@
#!/bin/bash
set -xe
THIS=$(dirname "$0")
source ../../taskcluster/tc-tests-utils.sh
DEP_TASK_ID=$(curl -s https://community-tc.services.mozilla.com/api/queue/v1/task/${TASK_ID} | python -c 'import json; import sys; print(" ".join(json.loads(sys.stdin.read())["dependencies"]));')
get_python_wheel_url()
{
local this_python_version=$1
extract_python_versions "${this_python_version}" "pyver" "pyver_pkg" "py_unicode_type" "pyconf" "pyalias"
echo "$(get_python_pkg_url "${pyver_pkg}" "${py_unicode_type}" "deepspeech" https://community-tc.services.mozilla.com/api/queue/v1/task/${DEP_TASK_ID}/artifacts/public)"
}
get_npm_package_url()
{
echo "https://community-tc.services.mozilla.com/api/queue/v1/task/${DEP_TASK_ID}/artifacts/public/deepspeech-${DS_VERSION}.tgz"
}

View file

@ -0,0 +1,92 @@
import sys
import os
import logging
import argparse
import subprocess
import shlex
import numpy as np
import wavTranscriber
# Debug helpers
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
def main(args):
parser = argparse.ArgumentParser(description='Transcribe long audio files using webRTC VAD or use the streaming interface')
parser.add_argument('--aggressive', type=int, choices=range(4), required=False,
help='Determines how aggressively non-speech is filtered out. (Integer between 0 and 3)')
parser.add_argument('--audio', required=False,
help='Path to the audio file to run (WAV format)')
parser.add_argument('--model', required=True,
help='Path to directory that contains all model files (output_graph, lm and trie)')
parser.add_argument('--stream', required=False, action='store_true',
help='To use deepspeech streaming interface')
args = parser.parse_args()
if args.stream is True:
print("Opening mic for streaming")
elif args.audio is not None:
logging.debug("Transcribing audio file @ %s" % args.audio)
else:
parser.print_help()
parser.exit()
# Point to a path containing the pre-trained models & resolve ~ if used
dirName = os.path.expanduser(args.model)
# Resolve all the paths of model files
output_graph, lm, trie = wavTranscriber.resolve_models(dirName)
# Load output_graph, alphabet, lm and trie
model_retval = wavTranscriber.load_model(output_graph, lm, trie)
if args.audio is not None:
title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'LM Load Time(s)']
print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
inference_time = 0.0
# Run VAD on the input file
waveFile = args.audio
segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, args.aggressive)
f = open(waveFile.rstrip(".wav") + ".txt", 'w')
logging.debug("Saving Transcript @: %s" % waveFile.rstrip(".wav") + ".txt")
for i, segment in enumerate(segments):
# Run deepspeech on the chunk that just completed VAD
logging.debug("Processing chunk %002d" % (i,))
audio = np.frombuffer(segment, dtype=np.int16)
output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
inference_time += output[1]
logging.debug("Transcript: %s" % output[0])
f.write(output[0] + " ")
# Summary of the files processed
f.close()
# Extract filename from the full file path
filename, ext = os.path.splitext(os.path.basename(waveFile))
logging.debug("************************************************************************************************************")
logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2]))
logging.debug("************************************************************************************************************")
print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2]))
else:
sctx = model_retval[0].createStream()
subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
stdout=subprocess.PIPE,
bufsize=0)
print('You can start speaking now. Press Control-C to stop recording.')
try:
while True:
data = subproc.stdout.read(512)
model_retval[0].feedAudioContent(sctx, np.frombuffer(data, np.int16))
except KeyboardInterrupt:
print('Transcription: ', model_retval[0].finishStream(sctx))
subproc.terminate()
subproc.wait()
if __name__ == '__main__':
main(sys.argv[1:])

View file

@ -0,0 +1,388 @@
import sys
import os
import time
import logging
import traceback
import numpy as np
import wavTranscriber
from PyQt5.QtWidgets import *
from PyQt5.QtGui import *
from PyQt5.QtCore import *
import shlex
import subprocess
# Debug helpers
logging.basicConfig(stream=sys.stderr,
level=logging.DEBUG,
format='%(filename)s - %(funcName)s@%(lineno)d %(name)s:%(levelname)s %(message)s')
class WorkerSignals(QObject):
'''
Defines the signals available from a running worker thread.
Supported signals are:
finished:
No data
error
'tuple' (exctype, value, traceback.format_exc())
result
'object' data returned from processing, anything
progress
'object' indicating the transcribed result
'''
finished = pyqtSignal()
error = pyqtSignal(tuple)
result = pyqtSignal(object)
progress = pyqtSignal(object)
class Worker(QRunnable):
'''
Worker Thread
Inherits from QRunnable to handle worker thread setup, signals and wrap-up
@param callback:
The function callback to run on this worker thread.
Supplied args and kwargs will be passed through the runner.
@type callback: function
@param args: Arguments to pass to the callback function
@param kwargs: Keywords to pass to the callback function
'''
def __init__(self, fn, *args, **kwargs):
super(Worker, self).__init__()
# Store the constructor arguments (re-used for processing)
self.fn = fn
self.args = args
self.kwargs = kwargs
self.signals = WorkerSignals()
# Add the callback to our kwargs
self.kwargs['progress_callback'] = self.signals.progress
@pyqtSlot()
def run(self):
'''
Initialise the runner function with the passed args, kwargs
'''
# Retrieve args/kwargs here; and fire up the processing using them
try:
transcript = self.fn(*self.args, **self.kwargs)
except:
traceback.print_exc()
exctype, value = sys.exc_info()[:2]
self.signals.error.emit((exctype, value, traceback.format_exc()))
else:
# Return the result of the processing
self.signals.result.emit(transcript)
finally:
# Done
self.signals.finished.emit()
class App(QMainWindow):
dirName = ""
def __init__(self):
super().__init__()
self.title = 'Deepspeech Transcriber'
self.left = 10
self.top = 10
self.width = 480
self.height = 400
self.initUI()
def initUI(self):
self.setWindowTitle(self.title)
self.setGeometry(self.left, self.top, self.width, self.height)
layout = QGridLayout()
layout.setSpacing(10)
self.microphone = QRadioButton("Microphone")
self.fileUpload = QRadioButton("File Upload")
self.browseBox = QLineEdit(self, placeholderText="Wave File, Mono @ 16 kHz, 16bit Little-Endian")
self.modelsBox = QLineEdit(self, placeholderText="Directory path for output_graph, lm & trie")
self.textboxTranscript = QPlainTextEdit(self, placeholderText="Transcription")
self.browseButton = QPushButton('Browse', self)
self.browseButton.setToolTip('Select a wav file')
self.modelsButton = QPushButton('Browse', self)
self.modelsButton.setToolTip('Select deepspeech models folder')
self.transcribeWav = QPushButton('Transcribe Wav', self)
self.transcribeWav.setToolTip('Start Wav Transcription')
self.openMicrophone = QPushButton('Start Speaking', self)
self.openMicrophone.setToolTip('Open Microphone')
layout.addWidget(self.microphone, 0, 1, 1, 2)
layout.addWidget(self.fileUpload, 0, 3, 1, 2)
layout.addWidget(self.browseBox, 1, 0, 1, 4)
layout.addWidget(self.browseButton, 1, 4)
layout.addWidget(self.modelsBox, 2, 0, 1, 4)
layout.addWidget(self.modelsButton, 2, 4)
layout.addWidget(self.transcribeWav, 3, 1, 1, 1)
layout.addWidget(self.openMicrophone, 3, 3, 1, 1)
layout.addWidget(self.textboxTranscript, 5, 0, -1, 0)
w = QWidget()
w.setLayout(layout)
self.setCentralWidget(w)
# Microphone
self.microphone.clicked.connect(self.mic_activate)
# File Upload
self.fileUpload.clicked.connect(self.wav_activate)
# Connect Browse Button to Function on_click
self.browseButton.clicked.connect(self.browse_on_click)
# Connect the Models Button
self.modelsButton.clicked.connect(self.models_on_click)
# Connect Transcription button to threadpool
self.transcribeWav.clicked.connect(self.transcriptionStart_on_click)
# Connect Microphone button to threadpool
self.openMicrophone.clicked.connect(self.openMicrophone_on_click)
self.openMicrophone.setCheckable(True)
self.openMicrophone.toggle()
self.browseButton.setEnabled(False)
self.browseBox.setEnabled(False)
self.modelsBox.setEnabled(False)
self.modelsButton.setEnabled(False)
self.transcribeWav.setEnabled(False)
self.openMicrophone.setEnabled(False)
self.show()
# Setup Threadpool
self.threadpool = QThreadPool()
logging.debug("Multithreading with maximum %d threads" % self.threadpool.maxThreadCount())
@pyqtSlot()
def mic_activate(self):
logging.debug("Enable streaming widgets")
self.en_mic = True
self.browseButton.setEnabled(False)
self.browseBox.setEnabled(False)
self.modelsBox.setEnabled(True)
self.modelsButton.setEnabled(True)
self.transcribeWav.setEnabled(False)
self.openMicrophone.setStyleSheet('QPushButton {background-color: #70cc7c; color: black;}')
self.openMicrophone.setEnabled(True)
@pyqtSlot()
def wav_activate(self):
logging.debug("Enable wav transcription widgets")
self.en_mic = False
self.openMicrophone.setStyleSheet('QPushButton {background-color: #f7f7f7; color: black;}')
self.openMicrophone.setEnabled(False)
self.browseButton.setEnabled(True)
self.browseBox.setEnabled(True)
self.modelsBox.setEnabled(True)
self.modelsButton.setEnabled(True)
@pyqtSlot()
def browse_on_click(self):
logging.debug('Browse button clicked')
options = QFileDialog.Options()
options |= QFileDialog.DontUseNativeDialog
self.fileName, _ = QFileDialog.getOpenFileName(self, "Select wav file to be Transcribed", "","All Files (*.wav)")
if self.fileName:
self.browseBox.setText(self.fileName)
self.transcribeWav.setEnabled(True)
logging.debug(self.fileName)
@pyqtSlot()
def models_on_click(self):
logging.debug('Models Browse Button clicked')
self.dirName = QFileDialog.getExistingDirectory(self, "Select deepspeech models directory")
if self.dirName:
self.modelsBox.setText(self.dirName)
logging.debug(self.dirName)
# Threaded signal passing worker functions
worker = Worker(self.modelWorker, self.dirName)
worker.signals.result.connect(self.modelResult)
worker.signals.finished.connect(self.modelFinish)
worker.signals.progress.connect(self.modelProgress)
# Execute
self.threadpool.start(worker)
else:
logging.critical("*****************************************************")
logging.critical("Model path not specified..")
logging.critical("*****************************************************")
return "Transcription Failed, models path not specified"
def modelWorker(self, dirName, progress_callback):
self.textboxTranscript.setPlainText("Loading Models...")
self.openMicrophone.setStyleSheet('QPushButton {background-color: #f7f7f7; color: black;}')
self.openMicrophone.setEnabled(False)
self.show()
time.sleep(1)
return dirName
def modelProgress(self, s):
# FixMe: Write code to show progress here
pass
def modelResult(self, dirName):
# Fetch and Resolve all the paths of model files
output_graph, lm, trie = wavTranscriber.resolve_models(dirName)
# Load output_graph, alphabet, lm and trie
self.model = wavTranscriber.load_model(output_graph, lm, trie)
def modelFinish(self):
# self.timer.stop()
self.textboxTranscript.setPlainText("Loaded Models, start transcribing")
if self.en_mic is True:
self.openMicrophone.setStyleSheet('QPushButton {background-color: #70cc7c; color: black;}')
self.openMicrophone.setEnabled(True)
self.show()
@pyqtSlot()
def transcriptionStart_on_click(self):
logging.debug('Transcription Start button clicked')
# Clear out older data
self.textboxTranscript.setPlainText("")
self.show()
# Threaded signal passing worker functions
worker = Worker(self.wavWorker, self.fileName)
worker.signals.progress.connect(self.progress)
worker.signals.result.connect(self.transcription)
worker.signals.finished.connect(self.wavFinish)
# Execute
self.threadpool.start(worker)
@pyqtSlot()
def openMicrophone_on_click(self):
logging.debug('Preparing to open microphone...')
# Clear out older data
self.textboxTranscript.setPlainText("")
self.show()
# Threaded signal passing worker functions
# Prepare env for capturing from microphone and offload work to micWorker worker thread
if (not self.openMicrophone.isChecked()):
self.openMicrophone.setStyleSheet('QPushButton {background-color: #C60000; color: black;}')
self.openMicrophone.setText("Stop")
logging.debug("Start Recording pressed")
logging.debug("Preparing for transcription...")
sctx = self.model[0].createStream()
subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
stdout=subprocess.PIPE,
bufsize=0)
self.textboxTranscript.insertPlainText('You can start speaking now\n\n')
self.show()
logging.debug('You can start speaking now')
context = (sctx, subproc, self.model[0])
# Pass the state to streaming worker
worker = Worker(self.micWorker, context)
worker.signals.progress.connect(self.progress)
worker.signals.result.connect(self.transcription)
worker.signals.finished.connect(self.micFinish)
# Execute
self.threadpool.start(worker)
else:
logging.debug("Stop Recording")
'''
Capture the audio stream from the microphone.
The context is prepared by openMicrophone_on_click().
@param context: A tuple containing three objects:
1. The DeepSpeech streaming context (sctx)
2. The recording subprocess handle
3. The DeepSpeech model object
'''
def micWorker(self, context, progress_callback):
# Deepspeech Streaming will be run from this method
logging.debug("Recording from your microphone")
while (not self.openMicrophone.isChecked()):
data = context[1].stdout.read(512)
context[2].feedAudioContent(context[0], np.frombuffer(data, np.int16))
else:
transcript = context[2].finishStream(context[0])
context[1].terminate()
context[1].wait()
self.show()
progress_callback.emit(transcript)
return "\n*********************\nTranscription Done..."
def micFinish(self):
self.openMicrophone.setText("Start Speaking")
self.openMicrophone.setStyleSheet('QPushButton {background-color: #70cc7c; color: black;}')
def transcription(self, out):
logging.debug("%s" % out)
self.textboxTranscript.insertPlainText(out)
self.show()
def wavFinish(self):
logging.debug("File processed")
def progress(self, chunk):
logging.debug("Progress: %s" % chunk)
self.textboxTranscript.insertPlainText(chunk)
self.show()
def wavWorker(self, waveFile, progress_callback):
# Deepspeech will be run from this method
logging.debug("Preparing for transcription...")
inference_time = 0.0
# Run VAD on the input file
segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, 1)
f = open(waveFile.rstrip(".wav") + ".txt", 'w')
logging.debug("Saving Transcript @: %s" % waveFile.rstrip(".wav") + ".txt")
for i, segment in enumerate(segments):
# Run deepspeech on the chunk that just completed VAD
logging.debug("Processing chunk %002d" % (i,))
audio = np.frombuffer(segment, dtype=np.int16)
output = wavTranscriber.stt(self.model[0], audio, sample_rate)
inference_time += output[1]
f.write(output[0] + " ")
progress_callback.emit(output[0] + " ")
# Summary of the files processed
f.close()
# Format pretty, extract filename from the full file path
filename, ext = os.path.splitext(os.path.basename(waveFile))
title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'LM Load Time(s)']
logging.debug("************************************************************************************************************")
logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, self.model[1], self.model[2]))
logging.debug("************************************************************************************************************")
print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, self.model[1], self.model[2]))
return "\n*********************\nTranscription Done..."
def main(args):
app = QApplication(sys.argv)
w = App()
sys.exit(app.exec_())
if __name__ == '__main__':
main(sys.argv[1:])

View file

@ -0,0 +1,3 @@
deepspeech==0.6.0
webrtcvad
pyqt5

23
vad_transcriber/test.sh Executable file
View file

@ -0,0 +1,23 @@
#!/bin/bash
set -xe
THIS=$(dirname "$0")
pushd ${THIS}
source ../tests.sh
pip install --user $(get_python_wheel_url "$1")
pip install --user -r requirements.txt
python audioTranscript_cmd.py \
--audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \
--aggressive 0 \
--model $HOME/DeepSpeech/models/
python audioTranscript_cmd.py \
--audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \
--aggressive 0 \
--model $HOME/DeepSpeech/models/ \
--stream
popd

134
vad_transcriber/wavSplit.py Normal file
View file

@ -0,0 +1,134 @@
import collections
import contextlib
import wave
def read_wave(path):
"""Reads a .wav file.
Takes the path, and returns (PCM audio data, sample rate).
"""
with contextlib.closing(wave.open(path, 'rb')) as wf:
num_channels = wf.getnchannels()
assert num_channels == 1
sample_width = wf.getsampwidth()
assert sample_width == 2
sample_rate = wf.getframerate()
assert sample_rate in (8000, 16000, 32000)
frames = wf.getnframes()
pcm_data = wf.readframes(frames)
duration = frames / sample_rate
return pcm_data, sample_rate, duration
def write_wave(path, audio, sample_rate):
"""Writes a .wav file.
Takes path, PCM audio data, and sample rate.
"""
with contextlib.closing(wave.open(path, 'wb')) as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(sample_rate)
wf.writeframes(audio)
class Frame(object):
"""Represents a "frame" of audio data."""
def __init__(self, bytes, timestamp, duration):
self.bytes = bytes
self.timestamp = timestamp
self.duration = duration
def frame_generator(frame_duration_ms, audio, sample_rate):
"""Generates audio frames from PCM audio data.
Takes the desired frame duration in milliseconds, the PCM data, and
the sample rate.
Yields Frames of the requested duration.
"""
n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
offset = 0
timestamp = 0.0
duration = (float(n) / sample_rate) / 2.0
while offset + n < len(audio):
yield Frame(audio[offset:offset + n], timestamp, duration)
timestamp += duration
offset += n
def vad_collector(sample_rate, frame_duration_ms,
padding_duration_ms, vad, frames):
"""Filters out non-voiced audio frames.
Given a webrtcvad.Vad and a source of audio frames, yields only
the voiced audio.
Uses a padded, sliding window algorithm over the audio frames.
When more than 90% of the frames in the window are voiced (as
reported by the VAD), the collector triggers and begins yielding
audio frames. Then the collector waits until 90% of the frames in
the window are unvoiced to detrigger.
The window is padded at the front and back to provide a small
amount of silence or the beginnings/endings of speech around the
voiced frames.
Arguments:
sample_rate - The audio sample rate, in Hz.
frame_duration_ms - The frame duration in milliseconds.
padding_duration_ms - The amount to pad the window, in milliseconds.
vad - An instance of webrtcvad.Vad.
frames - a source of audio frames (sequence or generator).
Returns: A generator that yields PCM audio data.
"""
num_padding_frames = int(padding_duration_ms / frame_duration_ms)
# We use a deque for our sliding window/ring buffer.
ring_buffer = collections.deque(maxlen=num_padding_frames)
# We have two states: TRIGGERED and NOTTRIGGERED. We start in the
# NOTTRIGGERED state.
triggered = False
voiced_frames = []
for frame in frames:
is_speech = vad.is_speech(frame.bytes, sample_rate)
if not triggered:
ring_buffer.append((frame, is_speech))
num_voiced = len([f for f, speech in ring_buffer if speech])
# If we're NOTTRIGGERED and more than 90% of the frames in
# the ring buffer are voiced frames, then enter the
# TRIGGERED state.
if num_voiced > 0.9 * ring_buffer.maxlen:
triggered = True
# We want to yield all the audio we see from now until
# we are NOTTRIGGERED, but we have to start with the
# audio that's already in the ring buffer.
for f, s in ring_buffer:
voiced_frames.append(f)
ring_buffer.clear()
else:
# We're in the TRIGGERED state, so collect the audio data
# and add it to the ring buffer.
voiced_frames.append(frame)
ring_buffer.append((frame, is_speech))
num_unvoiced = len([f for f, speech in ring_buffer if not speech])
# If more than 90% of the frames in the ring buffer are
# unvoiced, then enter NOTTRIGGERED and yield whatever
# audio we've collected.
if num_unvoiced > 0.9 * ring_buffer.maxlen:
triggered = False
yield b''.join([f.bytes for f in voiced_frames])
ring_buffer.clear()
voiced_frames = []
if triggered:
pass
# If we have any leftover voiced audio when we run out of input,
# yield it.
if voiced_frames:
yield b''.join([f.bytes for f in voiced_frames])

View file

@ -0,0 +1,97 @@
import glob
import webrtcvad
import logging
import wavSplit
from deepspeech import Model
from timeit import default_timer as timer
'''
Load the pre-trained model into memory
@param models: Output Graph Protocol Buffer file
@param lm: Language model file
@param trie: Trie file
@Retval
Returns a list [DeepSpeech Object, Model Load Time, LM Load Time]
'''
def load_model(models, lm, trie):
BEAM_WIDTH = 500
LM_ALPHA = 0.75
LM_BETA = 1.85
model_load_start = timer()
ds = Model(models, BEAM_WIDTH)
model_load_end = timer() - model_load_start
logging.debug("Loaded model in %0.3fs." % (model_load_end))
lm_load_start = timer()
ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
lm_load_end = timer() - lm_load_start
logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))
return [ds, model_load_end, lm_load_end]
'''
Run Inference on input audio file
@param ds: Deepspeech object
@param audio: Input audio for running inference on
@param fs: Sample rate of the input audio file
@Retval:
Returns a list [Inference, Inference Time, Audio Length]
'''
def stt(ds, audio, fs):
inference_time = 0.0
audio_length = len(audio) * (1 / fs)
# Run Deepspeech
logging.debug('Running inference...')
inference_start = timer()
output = ds.stt(audio)
inference_end = timer() - inference_start
inference_time += inference_end
logging.debug('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length))
return [output, inference_time]
'''
Resolve directory path for the models and fetch each of them.
@param dirName: Path to the directory containing pre-trained models
@Retval:
Returns a tuple containing each of the model files (pb, lm and trie)
'''
def resolve_models(dirName):
pb = glob.glob(dirName + "/*.pb")[0]
logging.debug("Found Model: %s" % pb)
lm = glob.glob(dirName + "/lm.binary")[0]
trie = glob.glob(dirName + "/trie")[0]
logging.debug("Found Language Model: %s" % lm)
logging.debug("Found Trie: %s" % trie)
return pb, lm, trie
'''
Generate VAD segments. Filters out non-voiced audio frames.
@param wavFile: Input wav file to run VAD on.
@param aggressiveness: How aggressively non-speech is filtered out (integer between 0 and 3).
@Retval:
Returns a tuple of
segments: a bytearray of multiple smaller audio frames
(the longer audio split into multiple smaller ones)
sample_rate: Sample rate of the input audio file
audio_length: Duration of the input audio file
'''
def vad_segment_generator(wavFile, aggressiveness):
logging.debug("Caught the wav file @: %s" % (wavFile))
audio, sample_rate, audio_length = wavSplit.read_wave(wavFile)
assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
vad = webrtcvad.Vad(int(aggressiveness))
frames = wavSplit.frame_generator(30, audio, sample_rate)
frames = list(frames)
segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames)
return segments, sample_rate, audio_length

View file

@ -0,0 +1,107 @@
## Transcribing longer audio clips
The command-line and GUI tools perform transcription on long wav files.
They take a wav file of any duration, use the WebRTC Voice Activity Detector (VAD)
to split it into smaller chunks, and finally save a consolidated transcript.
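For orientation, here is a minimal sketch of that pipeline using the `wavTranscriber` helper module shipped in this directory (the `./models` directory and the `sample.wav` file name are placeholders; complete the setup steps below before trying it):
```
import numpy as np
import wavTranscriber

# Resolve the model files (output_graph, lm, trie) inside a models directory
output_graph, lm, trie = wavTranscriber.resolve_models('./models')
# load_model returns [DeepSpeech object, model load time, LM load time]
model, model_load_time, lm_load_time = wavTranscriber.load_model(output_graph, lm, trie)

# Split the input wav into voiced segments with WebRTC VAD (aggressiveness 1)
segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator('sample.wav', 1)

# Transcribe each voiced segment and stitch the results together
transcript = ""
for segment in segments:
    audio = np.frombuffer(segment, dtype=np.int16)
    text, inference_time = wavTranscriber.stt(model, audio, sample_rate)
    transcript += text + " "
print(transcript)
```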
### 0. Prerequisites
#### 0.1 Install required packages
Install the package that provides the `rec` utility on the machine:
Fedora:
``` sudo dnf install sox ```
Tested on: Fedora 29
Ubuntu/Debian:
``` sudo apt install sox ```
A list of distributions where the package is available can be found at: https://pkgs.org/download/sox
#### 0.2 Download DeepSpeech
Either clone the repository via git clone, or download a release from the releases page.
For the next steps we assume you have extracted the files to ~/Deepspeech.
#### 0.3 Set up your environment
Ubuntu/Debian:
```
~/Deepspeech$ sudo apt install virtualenv
~/Deepspeech$ cd examples/vad_transcriber
~/Deepspeech/examples/vad_transcriber$ virtualenv -p python3 venv
~/Deepspeech/examples/vad_transcriber$ source venv/bin/activate
(venv) ~/Deepspeech/examples/vad_transcriber$ pip3 install -r requirements.txt
```
Fedora
```
~/Deepspeech$ sudo dnf install python-virtualenv
~/Deepspeech$ cd examples/vad_transcriber
~/Deepspeech/examples/vad_transcriber$ virtualenv -p python3 venv
~/Deepspeech/examples/vad_transcriber$ source venv/bin/activate
(venv) ~/Deepspeech/examples/vad_transcriber$ pip3 install -r requirements.txt
```
Tested on: Fedora 29
### 1. Command line tool
The command line tool processes a wav file of any duration and returns a transcript,
which will be saved in the same directory as the input audio file.
The command line tool gives you control over the aggressiveness of the VAD.
Set the aggressiveness mode to an integer between 0 and 3:
0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.
```
(venv) ~/Deepspeech/examples/vad_transcriber
$ python3 audioTranscript_cmd.py --aggressive 1 --audio ./audio/guido-van-rossum.wav --model ./models/0.4.1/
Filename Duration(s) Inference Time(s) Model Load Time(s) LM Load Time(s)
sample_rec.wav 13.710 20.797 5.593 17.742
```
**Note:** Only `wav` files with a 16 kHz sample rate are supported for now. You can convert your files to the appropriate format with ffmpeg, if it is available on your system:
`ffmpeg -i infile.mp3 -ar 16000 -ac 1 outfile.wav`
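If ffmpeg is not available, sox (installed above for the `rec` utility) can usually perform a similar conversion for wav input; treat this as an untested suggestion rather than part of the original instructions:
`sox infile.wav -r 16000 -c 1 -b 16 outfile.wav`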
### 2. Minimalistic GUI
The GUI tool does the same job as the CLI tool. The VAD is fixed at an aggressiveness of 1.
The output is displayed in the transcription window and also saved into the same directory
as the input audio file.
```
(venv) ~/Deepspeech/examples/vad_transcriber
$ python3 audioTranscript_gui.py
```
![Deepspeech Transcriber](../../doc/audioTranscript.png)
#### 2.1. Sporadic failures in pyqt
Some systems run into a **_Cannot mix incompatible Qt library with this library_** error.
In such a scenario, the GUI tool will not work. The following steps are known to have solved the issue in most cases:
```
(venv) ~/Deepspeech/examples/vad_transcriber$ pip3 uninstall pyqt5
(venv) ~/Deepspeech/examples/vad_transcriber$ sudo apt install python3-pyqt5 canberra-gtk-module
(venv) ~/Deepspeech/examples/vad_transcriber$ export PYTHONPATH=/usr/lib/python3/dist-packages/
(venv) ~/Deepspeech/examples/vad_transcriber$ python3 audioTranscript_gui.py
```
#### 2.2 Useful Tips
##### The GUI program immediately crashes when you press start recording
This happens when you don't load the models via the "Browse Models" button before pressing the "Start recording" button.
##### What does error XYZ mean?
You can find a list of error codes and what they mean at https://deepspeech.readthedocs.io/en/latest/Error-Codes.html