Rename DeepSpeech to Mozilla Voice STT
README.rst
|
@ -1,17 +1,19 @@
|
||||||
DeepSpeech master Examples
|
Mozilla Voice STT master Examples
|
||||||
==========================
|
=================================
|
||||||
|
|
||||||
These are various examples on how to use or integrate DeepSpeech using our packages.
|
These are various user-contributed examples on how to use or integrate Mozilla Voice STT using our packages.
|
||||||
|
|
||||||
It is a good way to just try out DeepSpeech before learning how it works in detail, as well as a source of inspiration for ways you can integrate it into your application or solve common tasks like voice activity detection (VAD) or microphone streaming.
|
It is a good way to just try out Mozilla Voice STT before learning how it works in detail, as well as a source of inspiration for ways you can integrate it into your application or solve common tasks like voice activity detection (VAD) or microphone streaming.
|
||||||
|
|
||||||
Contributions are welcome!
|
Please understand that these examples are provided as-is, with no guarantee that they will work in every configuration.
|
||||||
|
|
||||||
**Note:** These examples target DeepSpeech **master branch** only. If you're using a different release, you need to go to the corresponding branch for the release:
|
Contributions like fixes to existing examples or new ones are welcome!
|
||||||
|
|
||||||
* `v0.7.x <https://github.com/mozilla/DeepSpeech-examples/tree/r0.7>`_
|
**Note:** These examples target Mozilla Voice STT **master branch** only. If you're using a different release, you need to go to the corresponding branch for the release:
|
||||||
* `v0.6.x <https://github.com/mozilla/DeepSpeech-examples/tree/r0.6>`_
|
|
||||||
* `master branch <https://github.com/mozilla/DeepSpeech-examples/tree/master>`_
|
* `v0.7.x <https://github.com/mozilla/STT-examples/tree/r0.7>`_
|
||||||
|
* `v0.6.x <https://github.com/mozilla/STT-examples/tree/r0.6>`_
|
||||||
|
* `master branch <https://github.com/mozilla/STT-examples/tree/master>`_
|
||||||
|
|
||||||
**List of examples**
|
**List of examples**
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# Android Microphone Streaming
|
# Android Microphone Streaming
|
||||||
|
|
||||||
Android demo application that streams audio from the microphone to deepspeech and transcribes it.
|
Android demo application that streams audio from the microphone to Mozilla Voice STT and transcribes it.
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
|
@ -16,7 +16,7 @@ Move the model files `deepspeech-0.8.0-models.pbmm`, `deepspeech-0.8.0-models.sc
|
||||||
Mind that the data directory will only be present after installing and launching the app once.
|
Mind that the data directory will only be present after installing and launching the app once.
|
||||||
|
|
||||||
```
|
```
|
||||||
adb push deepspeech-0.8.0-models.tflite deepspeech-0.8.0-models.scorer /storage/emulated/0/Android/data/org.deepspeechdemo/files/
|
adb push deepspeech-0.8.0-models.tflite deepspeech-0.8.0-models.scorer /storage/emulated/0/Android/data/org.mozilla.voice.sttdemo/files/
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also copy the files from your file browser to the device.
|
You can also copy the files from your file browser to the device.
|
||||||
|
|
|
@ -8,7 +8,7 @@ android {
|
||||||
compileSdkVersion 29
|
compileSdkVersion 29
|
||||||
buildToolsVersion "29.0.2"
|
buildToolsVersion "29.0.2"
|
||||||
defaultConfig {
|
defaultConfig {
|
||||||
applicationId "org.deepspeechdemo"
|
applicationId "org.mozilla.voice.sttdemo"
|
||||||
minSdkVersion 22
|
minSdkVersion 22
|
||||||
targetSdkVersion 29
|
targetSdkVersion 29
|
||||||
versionCode 1
|
versionCode 1
|
||||||
|
@ -34,7 +34,7 @@ dependencies {
|
||||||
implementation 'androidx.core:core-ktx:1.0.2'
|
implementation 'androidx.core:core-ktx:1.0.2'
|
||||||
implementation 'androidx.constraintlayout:constraintlayout:1.1.3'
|
implementation 'androidx.constraintlayout:constraintlayout:1.1.3'
|
||||||
|
|
||||||
implementation 'org.mozilla.deepspeech:libdeepspeech:0.8.0'
|
implementation 'org.mozilla.voice:stt:0.9.0-alpha.5'
|
||||||
|
|
||||||
testImplementation 'junit:junit:4.12'
|
testImplementation 'junit:junit:4.12'
|
||||||
androidTestImplementation 'androidx.test.ext:junit:1.1.0'
|
androidTestImplementation 'androidx.test.ext:junit:1.1.0'
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
|
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
|
||||||
package="org.deepspeechdemo">
|
package="org.mozilla.voice.sttdemo">
|
||||||
|
|
||||||
<uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE" />
|
<uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE" />
|
||||||
<uses-permission android:name="android.permission.RECORD_AUDIO" />
|
<uses-permission android:name="android.permission.RECORD_AUDIO" />
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
package org.deepspeechdemo
|
package org.mozilla.voice.sttdemo
|
||||||
|
|
||||||
import android.Manifest
|
import android.Manifest
|
||||||
import android.content.pm.PackageManager
|
import android.content.pm.PackageManager
|
||||||
|
@ -11,14 +11,14 @@ import android.view.View
|
||||||
import androidx.appcompat.app.AppCompatActivity
|
import androidx.appcompat.app.AppCompatActivity
|
||||||
import androidx.core.app.ActivityCompat
|
import androidx.core.app.ActivityCompat
|
||||||
import kotlinx.android.synthetic.main.activity_main.*
|
import kotlinx.android.synthetic.main.activity_main.*
|
||||||
import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel
|
import org.mozilla.voice.stt.MozillaVoiceSttModel
|
||||||
import org.mozilla.deepspeech.libdeepspeech.DeepSpeechStreamingState
|
import org.mozilla.voice.stt.MozillaVoiceSttStreamingState
|
||||||
import java.io.File
|
import java.io.File
|
||||||
|
|
||||||
|
|
||||||
class MainActivity : AppCompatActivity() {
|
class MainActivity : AppCompatActivity() {
|
||||||
private var model: DeepSpeechModel? = null
|
private var model: MozillaVoiceSttModel? = null
|
||||||
private var streamContext: DeepSpeechStreamingState? = null
|
private var streamContext: MozillaVoiceSttStreamingState? = null
|
||||||
|
|
||||||
// Change the following parameters regarding
|
// Change the following parameters regarding
|
||||||
// what works best for your use case or your language.
|
// what works best for your use case or your language.
|
||||||
|
@ -77,7 +77,7 @@ class MainActivity : AppCompatActivity() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
model = DeepSpeechModel(tfliteModelPath)
|
model = MozillaVoiceSttModel(tfliteModelPath)
|
||||||
model?.setBeamWidth(BEAM_WIDTH)
|
model?.setBeamWidth(BEAM_WIDTH)
|
||||||
model?.enableExternalScorer(scorerPath)
|
model?.enableExternalScorer(scorerPath)
|
||||||
model?.setScorerAlphaBeta(LM_ALPHA, LM_BETA)
|
model?.setScorerAlphaBeta(LM_ALPHA, LM_BETA)
|
|
@ -1,3 +1,3 @@
|
||||||
<resources>
|
<resources>
|
||||||
<string name="app_name">DeepSpeech Demo</string>
|
<string name="app_name">Mozilla Voice STT Demo</string>
|
||||||
</resources>
|
</resources>
|
||||||
|
|
|
@ -1,2 +1,2 @@
|
||||||
include ':app'
|
include ':app'
|
||||||
rootProject.name='DeepSpeechDemo'
|
rootProject.name='MozillaVoiceSttDemo'
|
||||||
|
|
|
@ -134,7 +134,7 @@ Running via the GPU takes half the time of using the CPU and has good results.
|
||||||
|
|
||||||
It will then run the individual commands like :
|
It will then run the individual commands like:
|
||||||
|
|
||||||
`deepspeech --model C:\Users\jmike\Documents\GitHub\DeepSpeech\deepspeech-0.8.0-models.pbmm --scorer C:\Users\jmike\Documents\GitHub\DeepSpeech\deepspeech-0.8.0-models.scorer --audio 'C:\Users\jmike\Downloads\podcast\45374977-48000-2-24d9a365625bb.mp3.wav' --json`
|
`mozilla_voice_stt --model C:\Users\jmike\Documents\GitHub\DeepSpeech\deepspeech-0.8.0-models.pbmm --scorer C:\Users\jmike\Documents\GitHub\DeepSpeech\deepspeech-0.8.0-models.scorer --audio 'C:\Users\jmike\Downloads\podcast\45374977-48000-2-24d9a365625bb.mp3.wav' --json`
|
||||||
|
|
||||||
|
|
||||||
Websites referenced:
|
Websites referenced:
|
||||||
|
|
|
@ -61,7 +61,7 @@ def main(dirname, ext, model, scorer):
|
||||||
|
|
||||||
command = " ".join(
|
command = " ".join(
|
||||||
[
|
[
|
||||||
"deepspeech",
|
"mozilla_voice_stt",
|
||||||
"--model",
|
"--model",
|
||||||
model,
|
model,
|
||||||
"--scorer",
|
"--scorer",
|
||||||
|
|
|
@ -10,7 +10,7 @@ cachetools==4.1.0
|
||||||
certifi==2020.4.5.2
|
certifi==2020.4.5.2
|
||||||
chardet==3.0.4
|
chardet==3.0.4
|
||||||
click==7.1.2
|
click==7.1.2
|
||||||
deepspeech==0.8.0
|
mozilla_voice_stt==0.9.0a5
|
||||||
delegator.py @ git+https://github.com/amitt001/delegator.py.git@194aa92543fbdbfbae0bcc24ca217819a7805da2
|
delegator.py @ git+https://github.com/amitt001/delegator.py.git@194aa92543fbdbfbae0bcc24ca217819a7805da2
|
||||||
flask==1.1.2
|
flask==1.1.2
|
||||||
gast==0.2.2
|
gast==0.2.2
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
deepspeech --model C:\Users\jmike\Documents\GitHub\DeepSpeech\deepspeech-0.7.3-models.pbmm --scorer C:\Users\jmike\Documents\GitHub\DeepSpeech\deepspeech-0.7.3-models.scorer --audio C:\Users\jmike\Documents\Audacity\clip.wav --json
|
mozilla_voice_stt --model C:\Users\jmike\Documents\GitHub\DeepSpeech\deepspeech-0.7.3-models.pbmm --scorer C:\Users\jmike\Documents\GitHub\DeepSpeech\deepspeech-0.7.3-models.scorer --audio C:\Users\jmike\Documents\Audacity\clip.wav --json
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# DeepSpeech Electron example
|
# Mozilla Voice STT Electron example
|
||||||
|
|
||||||
This is an example of DeepSpeech running in an Electron app with a ReactJS front-end and processing .wav files.
|
This is an example of Mozilla Voice STT running in an Electron app with a ReactJS front-end and processing .wav files.
|
||||||
|
|
||||||
## Install
|
## Install
|
||||||
|
|
||||||
|
@ -66,6 +66,6 @@ Test the (dmg/appimage/exe) package file that has been generated in `/dist`.
|
||||||
|
|
||||||
The model files download to the following directories and must be deleted manually
|
The model files download to the following directories and must be deleted manually:
|
||||||
|
|
||||||
- MacOSX: `~/Library/Application\ Support/deepspeech-electron`
|
- MacOSX: `~/Library/Application\ Support/mozilla_voice_stt-electron`
|
||||||
- Linux: `~/.config/deepspeech-electron`
|
- Linux: `~/.config/mozilla_voice_stt-electron`
|
||||||
- Windows: `~/AppData/Roaming/deepspeech-electron`
|
- Windows: `~/AppData/Roaming/mozilla_voice_stt-electron`
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
{
|
{
|
||||||
"name": "deepspeech-electron",
|
"name": "mozilla_voice_stt-electron",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"lockfileVersion": 1,
|
"lockfileVersion": 1,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
|
@ -1384,6 +1384,18 @@
|
||||||
"@types/yargs": "^13.0.0"
|
"@types/yargs": "^13.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"@mozilla-voice/stt": {
|
||||||
|
"version": "0.9.0-alpha.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/@mozilla-voice/stt/-/stt-0.9.0-alpha.5.tgz",
|
||||||
|
"integrity": "sha512-lyZmMnLKdmBzWonDazIvkbnyAlIXd5NDUaINf5wfOdsw4Rliv/hy/FjYdYN9Tccq4Zvcd+dbqgXGLeZuECGmIg==",
|
||||||
|
"requires": {
|
||||||
|
"argparse": "1.0.x",
|
||||||
|
"memory-stream": "1.0.x",
|
||||||
|
"node-pre-gyp": "0.15.x",
|
||||||
|
"node-wav": "0.0.2",
|
||||||
|
"sox-stream": "2.0.x"
|
||||||
|
}
|
||||||
|
},
|
||||||
"@mrmlnc/readdir-enhanced": {
|
"@mrmlnc/readdir-enhanced": {
|
||||||
"version": "2.2.1",
|
"version": "2.2.1",
|
||||||
"resolved": "https://registry.npmjs.org/@mrmlnc/readdir-enhanced/-/readdir-enhanced-2.2.1.tgz",
|
"resolved": "https://registry.npmjs.org/@mrmlnc/readdir-enhanced/-/readdir-enhanced-2.2.1.tgz",
|
||||||
|
@ -4790,18 +4802,6 @@
|
||||||
"resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.3.tgz",
|
"resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.3.tgz",
|
||||||
"integrity": "sha1-s2nW+128E+7PUk+RsHD+7cNXzzQ="
|
"integrity": "sha1-s2nW+128E+7PUk+RsHD+7cNXzzQ="
|
||||||
},
|
},
|
||||||
"deepspeech": {
|
|
||||||
"version": "0.8.0",
|
|
||||||
"resolved": "https://registry.npmjs.org/deepspeech/-/deepspeech-0.8.0.tgz",
|
|
||||||
"integrity": "sha512-jqU+NbXVZnS+okMgoiOhJz22RaHSmvIjmHaRu7IZ0xBDQbcqNGff4GXk4a5etfSXm3bXddRtBlfFr5KyQExjbw==",
|
|
||||||
"requires": {
|
|
||||||
"argparse": "1.0.x",
|
|
||||||
"memory-stream": "1.0.x",
|
|
||||||
"node-pre-gyp": "0.15.x",
|
|
||||||
"node-wav": "0.0.2",
|
|
||||||
"sox-stream": "2.0.x"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"default-gateway": {
|
"default-gateway": {
|
||||||
"version": "4.2.0",
|
"version": "4.2.0",
|
||||||
"resolved": "https://registry.npmjs.org/default-gateway/-/default-gateway-4.2.0.tgz",
|
"resolved": "https://registry.npmjs.org/default-gateway/-/default-gateway-4.2.0.tgz",
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "deepspeech-electron",
|
"name": "mozilla_voice_stt-electron",
|
||||||
"productName": "deepspeech-electron",
|
"productName": "mozilla_voice_stt-electron",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"description": "My Electron application description",
|
"description": "My Electron application description",
|
||||||
"main": "public/electron.js",
|
"main": "public/electron.js",
|
||||||
|
@ -20,15 +20,15 @@
|
||||||
"postinstall": "electron-builder install-app-deps",
|
"postinstall": "electron-builder install-app-deps",
|
||||||
"homepage": "./",
|
"homepage": "./",
|
||||||
"build": {
|
"build": {
|
||||||
"appId": "deepspeech-electron",
|
"appId": "mozilla_voice_stt-electron",
|
||||||
"productName": "deepspeech-electron",
|
"productName": "mozilla_voice_stt-electron",
|
||||||
"files": [
|
"files": [
|
||||||
"build/**/*",
|
"build/**/*",
|
||||||
"node_modules/**/*",
|
"node_modules/**/*",
|
||||||
"package.json"
|
"package.json"
|
||||||
],
|
],
|
||||||
"buildDependenciesFromSource": true,
|
"buildDependenciesFromSource": true,
|
||||||
"artifactName": "deepspeech-electron-${version}-${os}-${arch}.${ext}",
|
"artifactName": "mozilla_voice_stt-electron-${version}-${os}-${arch}.${ext}",
|
||||||
"dmg": {
|
"dmg": {
|
||||||
"title": "${productName}"
|
"title": "${productName}"
|
||||||
},
|
},
|
||||||
|
@ -52,7 +52,7 @@
|
||||||
},
|
},
|
||||||
"win": {
|
"win": {
|
||||||
"target": "nsis",
|
"target": "nsis",
|
||||||
"artifactName": "deepspeech-electron-${version}-${os}-${arch}.${ext}"
|
"artifactName": "mozilla_voice_stt-electron-${version}-${os}-${arch}.${ext}"
|
||||||
},
|
},
|
||||||
"linux": {
|
"linux": {
|
||||||
"target": [
|
"target": [
|
||||||
|
@ -66,7 +66,7 @@
|
||||||
"keywords": [],
|
"keywords": [],
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"deepspeech": "^0.8.0",
|
"@mozilla-voice/stt": "^0.9.0-alpha.5",
|
||||||
"electron-is-dev": "^1.1.0",
|
"electron-is-dev": "^1.1.0",
|
||||||
"lodash": "^4.17.15",
|
"lodash": "^4.17.15",
|
||||||
"node-abi": "^2.18.0",
|
"node-abi": "^2.18.0",
|
||||||
|
|
|
@ -35,7 +35,7 @@ function createWindow(model) {
|
||||||
app.quit()
|
app.quit()
|
||||||
});
|
});
|
||||||
|
|
||||||
// message from front-end App.js, request that this file be processed by DeepSpeech
|
// message from front-end App.js, request that this file be processed by Mozilla Voice STT
|
||||||
ipcMain.handle('recognize-wav', async function (event, file) {
|
ipcMain.handle('recognize-wav', async function (event, file) {
|
||||||
const filePath = path.resolve(__dirname, 'audio', file);
|
const filePath = path.resolve(__dirname, 'audio', file);
|
||||||
const results = await recognizeWav(filePath, model);
|
const results = await recognizeWav(filePath, model);
|
||||||
|
|
|
@ -8,12 +8,12 @@ const {getModel} = require('./recognize-wav');
|
||||||
let appDataPath;
|
let appDataPath;
|
||||||
|
|
||||||
if (fs.existsSync(path.resolve(__dirname, '../models/deepspeech-0.8.0-models.pbmm'))) {
|
if (fs.existsSync(path.resolve(__dirname, '../models/deepspeech-0.8.0-models.pbmm'))) {
|
||||||
// if the deepspeech model was found at the root, use that directory
|
// if the model was found at the root, use that directory
|
||||||
appDataPath = path.resolve(__dirname, '../models');
|
appDataPath = path.resolve(__dirname, '../models');
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// otherwise use the electron "appData" path
|
// otherwise use the electron "appData" path
|
||||||
appDataPath = path.resolve(electron.app.getPath('appData'), 'deepspeech-electron');
|
appDataPath = path.resolve(electron.app.getPath('appData'), 'mozilla_voice_stt-electron');
|
||||||
}
|
}
|
||||||
|
|
||||||
app.on('ready', function () {
|
app.on('ready', function () {
|
||||||
|
|
|
@ -25,7 +25,7 @@
|
||||||
Learn how to configure a non-root public URL by running `npm run build`.
|
Learn how to configure a non-root public URL by running `npm run build`.
|
||||||
-->
|
-->
|
||||||
<link rel="stylesheet" href="fonts/stylesheet.css" type="text/css" charset="utf-8" />
|
<link rel="stylesheet" href="fonts/stylesheet.css" type="text/css" charset="utf-8" />
|
||||||
<title>DeepSpeech Electron Example</title>
|
<title>Mozilla Voice STT Electron Example</title>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<noscript>You need to enable JavaScript to run this app.</noscript>
|
<noscript>You need to enable JavaScript to run this app.</noscript>
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
const DeepSpeech = require('deepspeech');
|
const mozillaVoiceStt = require('@mozilla-voice/stt');
|
||||||
const fs = require('fs');
|
const fs = require('fs');
|
||||||
const path = require('path');
|
const path = require('path');
|
||||||
const wav = require('wav');
|
const wav = require('wav');
|
||||||
const download = require('./download');
|
const download = require('./download');
|
||||||
|
|
||||||
// return the deepspeech model or download it if it is not found
|
// return the model or download it if it is not found
|
||||||
function getModel(appDataPath, callback) {
|
function getModel(appDataPath, callback) {
|
||||||
let modelPath = path.resolve(appDataPath, 'deepspeech-0.8.0-models.pbmm');
|
let modelPath = path.resolve(appDataPath, 'deepspeech-0.8.0-models.pbmm');
|
||||||
let scorerPath = path.resolve(appDataPath, 'deepspeech-0.8.0-models.scorer');
|
let scorerPath = path.resolve(appDataPath, 'deepspeech-0.8.0-models.scorer');
|
||||||
|
@ -23,14 +23,14 @@ function getModel(appDataPath, callback) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// create the deepspeech model
|
// create the model
|
||||||
function createModel(modelPath, scorerPath) {
|
function createModel(modelPath, scorerPath) {
|
||||||
const model = new DeepSpeech.Model(modelPath);
|
const model = new mozillaVoiceStt.Model(modelPath);
|
||||||
model.enableExternalScorer(scorerPath);
|
model.enableExternalScorer(scorerPath);
|
||||||
return model;
|
return model;
|
||||||
}
|
}
|
||||||
|
|
||||||
// create a deepspeech stream to process a .wav file
|
// create a stream to process a .wav file
|
||||||
function recognizeWav(path, model) {
|
function recognizeWav(path, model) {
|
||||||
return new Promise(function(resolve, reject) {
|
return new Promise(function(resolve, reject) {
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -21,7 +21,7 @@ class App extends Component {
|
||||||
files
|
files
|
||||||
}, () => {
|
}, () => {
|
||||||
files.forEach(file => {
|
files.forEach(file => {
|
||||||
// request that each file be processed by deepspeech
|
// request that each file be processed by Mozilla Voice STT
|
||||||
console.log('recognize', file);
|
console.log('recognize', file);
|
||||||
window.ipcRenderer.invoke('recognize-wav', file).then(result => {
|
window.ipcRenderer.invoke('recognize-wav', file).then(result => {
|
||||||
// add the recognition results to this.state.results
|
// add the recognition results to this.state.results
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# FFmpeg VAD Streaming
|
# FFmpeg VAD Streaming
|
||||||
|
|
||||||
Streaming inference from arbitrary source (FFmpeg input) to DeepSpeech, using VAD (voice activity detection). A fairly simple example demonstrating the DeepSpeech streaming API in Node.js.
|
Streaming inference from arbitrary source (FFmpeg input) to Mozilla Voice STT, using VAD (voice activity detection). A fairly simple example demonstrating the Mozilla Voice STT streaming API in Node.js.
|
||||||
|
|
||||||
This example was successfully tested with a mobile phone streaming a live feed to a RTMP server (nginx-rtmp), which then could be used by this script for near real time speech recognition.
|
This example was successfully tested with a mobile phone streaming a live feed to a RTMP server (nginx-rtmp), which then could be used by this script for near real time speech recognition.
|
||||||
|
|
||||||
|
@ -31,7 +31,7 @@ node ./index.js --audio rtmp://<IP>:1935/live/teststream \
|
||||||
```
|
```
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
Real time streaming inference with DeepSpeech's example audio ([audio-0.4.1.tar.gz](https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz)).
|
Real time streaming inference with Mozilla Voice STT's example audio ([audio-0.4.1.tar.gz](https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz)).
|
||||||
```bash
|
```bash
|
||||||
node ./index.js --audio $HOME/audio/2830-3980-0043.wav \
|
node ./index.js --audio $HOME/audio/2830-3980-0043.wav \
|
||||||
--scorer $HOME/models/kenlm.scorer \
|
--scorer $HOME/models/kenlm.scorer \
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
#!/usr/bin/env node
|
#!/usr/bin/env node
|
||||||
|
|
||||||
const VAD = require("node-vad");
|
const VAD = require("node-vad");
|
||||||
const Ds = require('deepspeech');
|
const mVS = require('@mozilla-voice/stt');
|
||||||
const argparse = require('argparse');
|
const argparse = require('argparse');
|
||||||
const util = require('util');
|
const util = require('util');
|
||||||
const { spawn } = require('child_process');
|
const { spawn } = require('child_process');
|
||||||
|
@ -15,11 +15,11 @@ let VersionAction = function VersionAction(options) {
|
||||||
util.inherits(VersionAction, argparse.Action);
|
util.inherits(VersionAction, argparse.Action);
|
||||||
|
|
||||||
VersionAction.prototype.call = function(parser) {
|
VersionAction.prototype.call = function(parser) {
|
||||||
Ds.printVersions();
|
mVS.printVersions();
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
};
|
};
|
||||||
|
|
||||||
let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
|
let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running Mozilla Voice STT inference.'});
|
||||||
parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
|
parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
|
||||||
parser.addArgument(['--scorer'], {help: 'Path to the scorer file', nargs: '?'});
|
parser.addArgument(['--scorer'], {help: 'Path to the scorer file', nargs: '?'});
|
||||||
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio source to run (ffmpeg supported formats)'});
|
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio source to run (ffmpeg supported formats)'});
|
||||||
|
@ -32,7 +32,7 @@ function totalTime(hrtimeValue) {
|
||||||
|
|
||||||
console.error('Loading model from file %s', args['model']);
|
console.error('Loading model from file %s', args['model']);
|
||||||
const model_load_start = process.hrtime();
|
const model_load_start = process.hrtime();
|
||||||
let model = new Ds.Model(args['model']);
|
let model = new mVS.Model(args['model']);
|
||||||
const model_load_end = process.hrtime(model_load_start);
|
const model_load_end = process.hrtime(model_load_start);
|
||||||
console.error('Loaded model in %ds.', totalTime(model_load_end));
|
console.error('Loaded model in %ds.', totalTime(model_load_end));
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"argparse": "^1.0.10",
|
"argparse": "^1.0.10",
|
||||||
"deepspeech": "0.8.0",
|
"@mozilla-voice/stt": "0.9.0-alpha.5",
|
||||||
"node-vad": "^1.1.1",
|
"node-vad": "^1.1.1",
|
||||||
"util": "^0.11.1"
|
"util": "^0.11.1"
|
||||||
},
|
},
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
|
|
||||||
Microphone VAD Streaming
|
Microphone VAD Streaming
|
||||||
========================
|
========================
|
||||||
|
|
||||||
Stream from microphone to DeepSpeech, using VAD (voice activity detection). A fairly simple example demonstrating the DeepSpeech streaming API in Python. Also useful for quick, real-time testing of models and decoding parameters.
|
Stream from microphone to Mozilla Voice STT, using VAD (voice activity detection). A fairly simple example demonstrating the Mozilla Voice STT streaming API in Python. Also useful for quick, real-time testing of models and decoding parameters.
|
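A minimal sketch of the streaming loop this example is built around (an assumption-heavy sketch: it presumes the ``mozilla_voice_stt`` package keeps the same ``Model``/stream interface as the old ``deepspeech`` package; the paths and the ``frames`` helper are illustrative):

.. code-block:: python

    import numpy as np
    import mozilla_voice_stt  # assumed module name, matching requirements.txt

    # Illustrative paths; point these at the model files you downloaded.
    model = mozilla_voice_stt.Model('deepspeech-0.8.0-models.pbmm')
    model.enableExternalScorer('deepspeech-0.8.0-models.scorer')

    def transcribe_frames(frames):
        """Feed VAD-gated PCM frames (bytes, 16-bit 16 kHz mono); return text."""
        stream = model.createStream()
        for frame in frames:
            stream.feedAudioContent(np.frombuffer(frame, np.int16))
        return stream.finishStream()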
||||||
|
|
||||||
Installation
|
Installation
|
||||||
------------
|
------------
|
||||||
|
@ -32,7 +31,7 @@ Usage
|
||||||
[-w SAVEWAV] [-f FILE] -m MODEL [-s SCORER]
|
[-w SAVEWAV] [-f FILE] -m MODEL [-s SCORER]
|
||||||
[-d DEVICE] [-r RATE]
|
[-d DEVICE] [-r RATE]
|
||||||
|
|
||||||
Stream from microphone to DeepSpeech using VAD
|
Stream from microphone to Mozilla Voice STT using VAD
|
||||||
|
|
||||||
optional arguments:
|
optional arguments:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
|
|
|
@ -60,7 +60,7 @@ class Audio(object):
|
||||||
"""
|
"""
|
||||||
Microphone may not support our native processing sampling rate, so
|
Microphone may not support our native processing sampling rate, so
|
||||||
resample from input_rate to RATE_PROCESS here for webrtcvad and
|
resample from input_rate to RATE_PROCESS here for webrtcvad and
|
||||||
deepspeech
|
mozilla_voice_stt
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
data (binary): Input audio stream
|
data (binary): Input audio stream
|
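The resampling the docstring describes could look like this (a sketch, assuming numpy and scipy are available; the function name and signature are illustrative rather than the example's actual helper):

```python
import numpy as np
from scipy import signal

def resample(data, input_rate, target_rate=16000):
    """Resample raw 16-bit mono PCM bytes from input_rate to target_rate."""
    audio = np.frombuffer(data, dtype=np.int16)
    target_len = int(len(audio) * target_rate / input_rate)
    resampled = signal.resample(audio, target_len)  # FFT-based resampling
    return np.asarray(resampled, dtype=np.int16).tobytes()
```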
||||||
|
@ -152,7 +152,7 @@ class VADAudio(Audio):
|
||||||
ring_buffer.clear()
|
ring_buffer.clear()
|
||||||
|
|
||||||
def main(ARGS):
|
def main(ARGS):
|
||||||
# Load DeepSpeech model
|
# Load model
|
||||||
if os.path.isdir(ARGS.model):
|
if os.path.isdir(ARGS.model):
|
||||||
model_dir = ARGS.model
|
model_dir = ARGS.model
|
||||||
ARGS.model = os.path.join(model_dir, 'output_graph.pb')
|
ARGS.model = os.path.join(model_dir, 'output_graph.pb')
|
||||||
|
@ -173,7 +173,7 @@ def main(ARGS):
|
||||||
print("Listening (ctrl-C to exit)...")
|
print("Listening (ctrl-C to exit)...")
|
||||||
frames = vad_audio.vad_collector()
|
frames = vad_audio.vad_collector()
|
||||||
|
|
||||||
# Stream from microphone to DeepSpeech using VAD
|
# Stream from microphone to Mozilla Voice STT using VAD
|
||||||
spinner = None
|
spinner = None
|
||||||
if not ARGS.nospinner:
|
if not ARGS.nospinner:
|
||||||
spinner = Halo(spinner='line')
|
spinner = Halo(spinner='line')
|
||||||
|
@ -199,7 +199,7 @@ if __name__ == '__main__':
|
||||||
DEFAULT_SAMPLE_RATE = 16000
|
DEFAULT_SAMPLE_RATE = 16000
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
|
parser = argparse.ArgumentParser(description="Stream from microphone to Mozilla Voice STT using VAD")
|
||||||
|
|
||||||
parser.add_argument('-v', '--vad_aggressiveness', type=int, default=3,
|
parser.add_argument('-v', '--vad_aggressiveness', type=int, default=3,
|
||||||
help="Set aggressiveness of VAD: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
|
help="Set aggressiveness of VAD: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
deepspeech~=0.8.0
|
mozilla_voice_stt~=0.9.0a5
|
||||||
pyaudio~=0.2.11
|
pyaudio~=0.2.11
|
||||||
webrtcvad~=2.0.10
|
webrtcvad~=2.0.10
|
||||||
halo~=0.0.18
|
halo~=0.0.18
|
||||||
|
|
|
@ -8,7 +8,7 @@ pushd ${THIS}
|
||||||
source ../tests.sh
|
source ../tests.sh
|
||||||
|
|
||||||
pip install --user $(get_python_wheel_url "$1")
|
pip install --user $(get_python_wheel_url "$1")
|
||||||
pip install --user -r <(grep -v deepspeech requirements.txt)
|
pip install --user -r <(grep -v mozilla_voice_stt requirements.txt)
|
||||||
|
|
||||||
pulseaudio &
|
pulseaudio &
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
<Application
|
<Application
|
||||||
x:Class="DeepSpeechWPF.App"
|
x:Class="MozillaVoiceSttWPF.App"
|
||||||
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
|
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
|
||||||
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
|
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
|
||||||
xmlns:local="clr-namespace:DeepSpeechWPF"
|
xmlns:local="clr-namespace:MozillaVoiceSttWPF"
|
||||||
StartupUri="MainWindow.xaml">
|
StartupUri="MainWindow.xaml">
|
||||||
<Application.Resources />
|
<Application.Resources />
|
||||||
</Application>
|
</Application>
|
|
@ -1,10 +1,10 @@
|
||||||
using CommonServiceLocator;
|
using CommonServiceLocator;
|
||||||
using DeepSpeech.WPF.ViewModels;
|
using MozillaVoiceStt.WPF.ViewModels;
|
||||||
using DeepSpeechClient.Interfaces;
|
using MozillaVoiceSttClient.Interfaces;
|
||||||
using GalaSoft.MvvmLight.Ioc;
|
using GalaSoft.MvvmLight.Ioc;
|
||||||
using System.Windows;
|
using System.Windows;
|
||||||
|
|
||||||
namespace DeepSpeechWPF
|
namespace MozillaVoiceSttWPF
|
||||||
{
|
{
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Interaction logic for App.xaml
|
/// Interaction logic for App.xaml
|
||||||
|
@ -18,11 +18,11 @@ namespace DeepSpeechWPF
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
//Register instance of DeepSpeech
|
//Register instance of Mozilla Voice STT
|
||||||
DeepSpeechClient.DeepSpeech deepSpeechClient =
|
MozillaVoiceSttClient.MozillaVoiceSttModel mozillaVoiceSttClient =
|
||||||
new DeepSpeechClient.DeepSpeech("deepspeech-0.8.0-models.pbmm");
|
new MozillaVoiceSttClient.MozillaVoiceSttModel("deepspeech-0.8.0-models.pbmm");
|
||||||
|
|
||||||
SimpleIoc.Default.Register<IDeepSpeech>(() => deepSpeechClient);
|
SimpleIoc.Default.Register<IMozillaVoiceStt>(() => mozillaVoiceSttClient);
|
||||||
SimpleIoc.Default.Register<MainWindowViewModel>();
|
SimpleIoc.Default.Register<MainWindowViewModel>();
|
||||||
}
|
}
|
||||||
catch (System.Exception ex)
|
catch (System.Exception ex)
|
||||||
|
@ -35,8 +35,8 @@ namespace DeepSpeechWPF
|
||||||
protected override void OnExit(ExitEventArgs e)
|
protected override void OnExit(ExitEventArgs e)
|
||||||
{
|
{
|
||||||
base.OnExit(e);
|
base.OnExit(e);
|
||||||
//Dispose instance of DeepSpeech
|
//Dispose instance of Mozilla Voice STT
|
||||||
ServiceLocator.Current.GetInstance<IDeepSpeech>()?.Dispose();
|
ServiceLocator.Current.GetInstance<IMozillaVoiceStt>()?.Dispose();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -1,10 +1,10 @@
|
||||||
<Window
|
<Window
|
||||||
x:Class="DeepSpeechWPF.MainWindow"
|
x:Class="MozillaVoiceSttWPF.MainWindow"
|
||||||
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
|
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
|
||||||
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
|
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
|
||||||
xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
|
xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
|
||||||
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
|
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
|
||||||
Title="Deepspeech client"
|
Title="Mozilla Voice STT client"
|
||||||
Width="800"
|
Width="800"
|
||||||
Height="600"
|
Height="600"
|
||||||
Loaded="Window_Loaded"
|
Loaded="Window_Loaded"
|
|
@ -1,8 +1,8 @@
|
||||||
using CommonServiceLocator;
|
using CommonServiceLocator;
|
||||||
using DeepSpeech.WPF.ViewModels;
|
using MozillaVoiceStt.WPF.ViewModels;
|
||||||
using System.Windows;
|
using System.Windows;
|
||||||
|
|
||||||
namespace DeepSpeechWPF
|
namespace MozillaVoiceSttWPF
|
||||||
{
|
{
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Interaction logic for MainWindow.xaml
|
/// Interaction logic for MainWindow.xaml
|
|
@ -6,8 +6,8 @@
|
||||||
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
|
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
|
||||||
<ProjectGuid>{54BFD766-4305-4F4C-BA59-AF45505DF3C1}</ProjectGuid>
|
<ProjectGuid>{54BFD766-4305-4F4C-BA59-AF45505DF3C1}</ProjectGuid>
|
||||||
<OutputType>WinExe</OutputType>
|
<OutputType>WinExe</OutputType>
|
||||||
<RootNamespace>DeepSpeech.WPF</RootNamespace>
|
<RootNamespace>MozillaVoiceStt.WPF</RootNamespace>
|
||||||
<AssemblyName>DeepSpeech.WPF</AssemblyName>
|
<AssemblyName>MozillaVoiceStt.WPF</AssemblyName>
|
||||||
<TargetFrameworkVersion>v4.6.2</TargetFrameworkVersion>
|
<TargetFrameworkVersion>v4.6.2</TargetFrameworkVersion>
|
||||||
<FileAlignment>512</FileAlignment>
|
<FileAlignment>512</FileAlignment>
|
||||||
<ProjectTypeGuids>{60dc8134-eba5-43b8-bcc9-bb4bc16c2548};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
|
<ProjectTypeGuids>{60dc8134-eba5-43b8-bcc9-bb4bc16c2548};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
|
||||||
|
@ -131,9 +131,9 @@
|
||||||
<None Include="App.config" />
|
<None Include="App.config" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ProjectReference Include="..\..\..\ds\native_client\dotnet\DeepSpeechClient\DeepSpeechClient.csproj">
|
<ProjectReference Include="..\..\..\ds\native_client\dotnet\MozillaVoiceSttClient\MozillaVoiceSttClient.csproj">
|
||||||
<Project>{56de4091-bbbe-47e4-852d-7268b33b971f}</Project>
|
<Project>{56de4091-bbbe-47e4-852d-7268b33b971f}</Project>
|
||||||
<Name>DeepSpeechClient</Name>
|
<Name>MozillaVoiceSttClient</Name>
|
||||||
</ProjectReference>
|
</ProjectReference>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
|
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
|
|
@ -3,9 +3,9 @@ Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
# Visual Studio 15
|
# Visual Studio 15
|
||||||
VisualStudioVersion = 15.0.28307.421
|
VisualStudioVersion = 15.0.28307.421
|
||||||
MinimumVisualStudioVersion = 10.0.40219.1
|
MinimumVisualStudioVersion = 10.0.40219.1
|
||||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeech.WPF", "DeepSpeech.WPF.csproj", "{54BFD766-4305-4F4C-BA59-AF45505DF3C1}"
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MozillaVoiceStt.WPF", "MozillaVoiceStt.WPF.csproj", "{54BFD766-4305-4F4C-BA59-AF45505DF3C1}"
|
||||||
EndProject
|
EndProject
|
||||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechClient", "..\..\..\ds\native_client\dotnet\DeepSpeechClient\DeepSpeechClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}"
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MozillaVoiceSttClient", "..\..\..\ds\native_client\dotnet\MozillaVoiceSttClient\MozillaVoiceSttClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}"
|
||||||
EndProject
|
EndProject
|
||||||
Global
|
Global
|
||||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
|
@ -7,11 +7,11 @@ using System.Windows;
|
||||||
// General Information about an assembly is controlled through the following
|
// General Information about an assembly is controlled through the following
|
||||||
// set of attributes. Change these attribute values to modify the information
|
// set of attributes. Change these attribute values to modify the information
|
||||||
// associated with an assembly.
|
// associated with an assembly.
|
||||||
[assembly: AssemblyTitle("DeepSpeech.WPF")]
|
[assembly: AssemblyTitle("MozillaVoiceStt.WPF")]
|
||||||
[assembly: AssemblyDescription("")]
|
[assembly: AssemblyDescription("")]
|
||||||
[assembly: AssemblyConfiguration("")]
|
[assembly: AssemblyConfiguration("")]
|
||||||
[assembly: AssemblyCompany("")]
|
[assembly: AssemblyCompany("")]
|
||||||
[assembly: AssemblyProduct("DeepSpeech.WPF.SingleFiles")]
|
[assembly: AssemblyProduct("MozillaVoiceStt.WPF.SingleFiles")]
|
||||||
[assembly: AssemblyCopyright("Copyright © 2018")]
|
[assembly: AssemblyCopyright("Copyright © 2018")]
|
||||||
[assembly: AssemblyTrademark("")]
|
[assembly: AssemblyTrademark("")]
|
||||||
[assembly: AssemblyCulture("")]
|
[assembly: AssemblyCulture("")]
|
|
@ -8,7 +8,7 @@
|
||||||
// </auto-generated>
|
// </auto-generated>
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
namespace DeepSpeech.WPF.Properties {
|
namespace MozillaVoiceStt.WPF.Properties {
|
||||||
using System;
|
using System;
|
||||||
|
|
||||||
|
|
||||||
|
@ -39,7 +39,7 @@ namespace DeepSpeech.WPF.Properties {
|
||||||
internal static global::System.Resources.ResourceManager ResourceManager {
|
internal static global::System.Resources.ResourceManager ResourceManager {
|
||||||
get {
|
get {
|
||||||
if (object.ReferenceEquals(resourceMan, null)) {
|
if (object.ReferenceEquals(resourceMan, null)) {
|
||||||
global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("DeepSpeech.WPF.Properties.Resources", typeof(Resources).Assembly);
|
global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("MozillaVoiceStt.WPF.Properties.Resources", typeof(Resources).Assembly);
|
||||||
resourceMan = temp;
|
resourceMan = temp;
|
||||||
}
|
}
|
||||||
return resourceMan;
|
return resourceMan;
|
|
@ -8,7 +8,7 @@
|
||||||
// </auto-generated>
|
// </auto-generated>
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
namespace DeepSpeech.WPF.Properties {
|
namespace MozillaVoiceStt.WPF.Properties {
|
||||||
|
|
||||||
|
|
||||||
[global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
|
[global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
|
|
@ -3,7 +3,7 @@ using System.Collections.Generic;
|
||||||
using System.ComponentModel;
|
using System.ComponentModel;
|
||||||
using System.Runtime.CompilerServices;
|
using System.Runtime.CompilerServices;
|
||||||
|
|
||||||
namespace DeepSpeech.WPF.ViewModels
|
namespace MozillaVoiceStt.WPF.ViewModels
|
||||||
{
|
{
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Implementation of <see cref="INotifyPropertyChanged"/> to simplify models.
|
/// Implementation of <see cref="INotifyPropertyChanged"/> to simplify models.
|
|
@ -3,8 +3,8 @@ using CSCore;
|
||||||
using CSCore.CoreAudioAPI;
|
using CSCore.CoreAudioAPI;
|
||||||
using CSCore.SoundIn;
|
using CSCore.SoundIn;
|
||||||
using CSCore.Streams;
|
using CSCore.Streams;
|
||||||
using DeepSpeechClient.Interfaces;
|
using MozillaVoiceSttClient.Interfaces;
|
||||||
using DeepSpeechClient.Models;
|
using MozillaVoiceSttClient.Models;
|
||||||
using GalaSoft.MvvmLight.CommandWpf;
|
using GalaSoft.MvvmLight.CommandWpf;
|
||||||
using Microsoft.Win32;
|
using Microsoft.Win32;
|
||||||
using System;
|
using System;
|
||||||
|
@ -15,7 +15,7 @@ using System.IO;
|
||||||
using System.Threading;
|
using System.Threading;
|
||||||
using System.Threading.Tasks;
|
using System.Threading.Tasks;
|
||||||
|
|
||||||
namespace DeepSpeech.WPF.ViewModels
|
namespace MozillaVoiceStt.WPF.ViewModels
|
||||||
{
|
{
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// View model of the MainWindow View.
|
/// View model of the MainWindow View.
|
||||||
|
@ -27,7 +27,7 @@ namespace DeepSpeech.WPF.ViewModels
|
||||||
private const string ScorerPath = "kenlm.scorer";
|
private const string ScorerPath = "kenlm.scorer";
|
||||||
#endregion
|
#endregion
|
||||||
|
|
||||||
private readonly IDeepSpeech _sttClient;
|
private readonly IMozillaVoiceStt _sttClient;
|
||||||
|
|
||||||
#region Commands
|
#region Commands
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
@ -62,7 +62,7 @@ namespace DeepSpeech.WPF.ViewModels
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Stream used to feed data into the acoustic model.
|
/// Stream used to feed data into the acoustic model.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private DeepSpeechStream _sttStream;
|
private MozillaVoiceSttStream _sttStream;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Records the audio of the selected device.
|
/// Records the audio of the selected device.
|
||||||
|
@ -75,7 +75,7 @@ namespace DeepSpeech.WPF.ViewModels
|
||||||
private SoundInSource _soundInSource;
|
private SoundInSource _soundInSource;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Target wave source.(16KHz Mono 16bit for DeepSpeech)
|
/// Target wave source. (16KHz mono 16bit, as required by Mozilla Voice STT)
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private IWaveSource _convertedSource;
|
private IWaveSource _convertedSource;
|
||||||
|
|
||||||
|
@ -200,7 +200,7 @@ namespace DeepSpeech.WPF.ViewModels
|
||||||
#endregion
|
#endregion
|
||||||
|
|
||||||
#region Ctors
|
#region Ctors
|
||||||
public MainWindowViewModel(IDeepSpeech sttClient)
|
public MainWindowViewModel(IMozillaVoiceStt sttClient)
|
||||||
{
|
{
|
||||||
_sttClient = sttClient;
|
_sttClient = sttClient;
|
||||||
|
|
||||||
|
@ -290,7 +290,7 @@ namespace DeepSpeech.WPF.ViewModels
|
||||||
//read data from the converedSource
|
//read data from the convertedSource
|
||||||
//important: don't use the e.Data here
|
//important: don't use the e.Data here
|
||||||
//the e.Data contains the raw data provided by the
|
//the e.Data contains the raw data provided by the
|
||||||
//soundInSource which won't have the deepspeech required audio format
|
//soundInSource, which won't have the audio format the model requires
|
||||||
byte[] buffer = new byte[_convertedSource.WaveFormat.BytesPerSecond / 2];
|
byte[] buffer = new byte[_convertedSource.WaveFormat.BytesPerSecond / 2];
|
||||||
|
|
||||||
int read;
|
int read;
|
|
@ -8,29 +8,29 @@ Only difference for both OS is the library used for gathering audio data from
|
||||||
Interface to both the libs is provided through NIM code.
|
An interface to both libraries is provided through NIM code.
|
||||||
|
|
||||||
## PREREQUISITIES :
|
## PREREQUISITES:
|
||||||
* ```libdeepspeech.so```
|
* ```libmozilla_voice_stt.so```
|
||||||
|
|
||||||
Go to the [releases](https://github.com/mozilla/DeepSpeech/releases/tag/v0.8.0) page and download the native client package based on your OS and CPU architecture.
|
Go to the [releases](https://github.com/mozilla/DeepSpeech/releases/tag/v0.8.0) page and download the native client package based on your OS and CPU architecture.
|
||||||
|
|
||||||
Extract the ``libdeepspeech.so`` and put into the subdirectory depending on OS of native Client used.
|
Extract ``libmozilla_voice_stt.so`` and put it into the subdirectory matching the OS of the native client you downloaded.
|
||||||
|
|
||||||
#### On WINDOWS:
|
#### On WINDOWS:
|
||||||
* Download the ```native.client.amd64.win.tar.xz ``` package . [ same is true for ``xx.xx.amd64.cuda.win.xx`` if CUDA installed or ``xx.xx.amd64.tflite.win.xx``]
|
* Download the ```native_client.amd64.win.tar.xz``` package. [The same is true for ``xx.xx.amd64.cuda.win.xx`` if CUDA is installed, or ``xx.xx.amd64.tflite.win.xx``.]
|
||||||
* Extract and place the ```libdeepspeech.so``` in ```win_nim_vad_streaming``` subdirectory
|
* Extract and place ```libmozilla_voice_stt.so``` in the ```win_nim_vad_streaming``` subdirectory
|
||||||
* Now see ``README.md`` in ```win_nim_vad_streaming``` subdirectory.
|
* Now see ``README.md`` in the ```win_nim_vad_streaming``` subdirectory.
|
||||||
|
|
||||||
#### On LINUX:
|
#### On LINUX:
|
||||||
* Download the ```native_client.amd64.linux.cpu ``` package .[ same is true for ``xx.xx.amd64.cuda.linux.xx`` is CUDA installed or ``xx.xx.amd64.tflite.linux.xx``]
|
* Download the ```native_client.amd64.linux.cpu``` package. [The same is true for ``xx.xx.amd64.cuda.linux.xx`` if CUDA is installed, or ``xx.xx.amd64.tflite.linux.xx``.]
|
||||||
* Extract and place the ```libdeepspeech.so``` in ```linux_nim_vad_streaming``` subdirectory
|
* Extract and place ```libmozilla_voice_stt.so``` in the ```linux_nim_vad_streaming``` subdirectory
|
||||||
* Now see ``README.md`` in ```linux_nim_vad_streaming``` subdirectory.
|
* Now see ``README.md`` in the ```linux_nim_vad_streaming``` subdirectory.
|
||||||
|
|
||||||
_Note: One can put ``libdeepspeech.so`` in the system's PATH rather than copying it to one of subdirectories for easy usage._
|
_Note: For easy usage, one can put ``libmozilla_voice_stt.so`` in the system's PATH rather than copying it to one of the subdirectories._
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## NOTE:
|
## NOTE:
|
||||||
Used NIM code only depends on the shared library(``libdeepspeech.so``) used.
|
The NIM code used depends only on the shared library (``libmozilla_voice_stt.so``).
|
||||||
Given one has downloaded the native client package and extracted the ``libdeepspeech.so`` shared library and copied it to one of the subdirectories or in system's PATH ,Code can be modified to add more functionalities in pure NIM and modified code would compile on any platform as long as that platform is supported by NIM.
|
Given that one has downloaded the native client package, extracted the ``libmozilla_voice_stt.so`` shared library, and copied it to one of the subdirectories or into the system's PATH, the code can be modified to add more functionality in pure NIM, and the modified code will compile on any platform supported by NIM.
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,15 +1,15 @@
|
||||||
# MICROPHONE VAD STREAMING
|
# MICROPHONE VAD STREAMING
|
||||||
Minimalistic example to demonstrate the DeepSpeech streaming API in NIM.Raw audio is streamed from microphone to the DeepSpeech based on VAD (voice Activity Detection).
|
Minimalistic example to demonstrate the Mozilla Voice STT streaming API in NIM. Raw audio is streamed from the microphone to Mozilla Voice STT based on VAD (Voice Activity Detection).
|
||||||
|
|
||||||
## Prerequisites:
|
## Prerequisites:
|
||||||
0) Please read ``PREREQUISITES`` in [README](../README.md) for getting the required ``libdeepspeech.so`` shared library.
|
0) Please read ``PREREQUISITES`` in [README](../README.md) for getting the required ``libmozilla_voice_stt.so`` shared library.
|
||||||
1) This example depends on the ``libasound.so``(which is distributed along with all major linux distros and present in linker's default path)
|
1) This example depends on ``libasound.so`` (which is distributed with all major Linux distros and is present in the linker's default path).
|
||||||
|
|
||||||
_Note: You may need to install ``libasound.so`` if not found_
|
_Note: You may need to install ``libasound.so`` if not found_
|
||||||
```
|
```
|
||||||
sudo apt-get install libasound2
|
sudo apt-get install libasound2
|
||||||
```
|
```
|
||||||
2) Download the pre-trained DeepSpeech english model (1089MB) and Scorer Package(~900MB):
|
2) Download the pre-trained Mozilla Voice STT English model (1089MB) and scorer package (~900MB):
|
||||||
|
|
||||||
```
|
```
|
||||||
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.8.0/deepspeech-0.8.0-models.pbmm
|
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.8.0/deepspeech-0.8.0-models.pbmm
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import os ,deques,math,strutils,parseopt,tables,strformat
|
import os ,deques,math,strutils,parseopt,tables,strformat
|
||||||
import alsa,webrtcvad,wav
|
import alsa,webrtcvad,wav
|
||||||
import deepspeech
|
import mozilla_voice_stt
|
||||||
|
|
||||||
var
|
var
|
||||||
args = initTable[string, string]()
|
args = initTable[string, string]()
|
||||||
|
@ -28,8 +28,8 @@ let
|
||||||
hw_params: snd_pcm_hw_params_ref = nil
|
hw_params: snd_pcm_hw_params_ref = nil
|
||||||
device_name = "plughw:0,0" #PCM hardware alsa Device.
|
device_name = "plughw:0,0" #PCM hardware alsa Device.
|
||||||
size = (int((frameDuration*int(rate))/1000))
|
size = (int((frameDuration*int(rate))/1000))
|
||||||
modelPtr: ModelState = nil #deepSpeech model
|
modelPtr: ModelState = nil #Mozilla Voice STT model
|
||||||
deepStreamPtr: StreamingState = nil #deepSpeech model stream
|
deepStreamPtr: StreamingState = nil #Mozilla Voice STT model stream
|
||||||
modelPath = args["model"]
|
modelPath = args["model"]
|
||||||
|
|
||||||
var
|
var
|
||||||
|
@ -40,7 +40,7 @@ var
|
||||||
framesLen: clong
|
framesLen: clong
|
||||||
vad:vadObj #VAD Object declaration
|
vad:vadObj #VAD Object declaration
|
||||||
codeV: cint #to hold the error codes for VAD.
|
codeV: cint #to hold the error codes for VAD.
|
||||||
codeD: cint #to hold the error codes for deepSpeech
|
codeD: cint #to hold the error codes for Mozilla Voice STT
|
||||||
#to get the data from the channel.
|
#to get the data from the channel.
|
||||||
frame : seq[int16]
|
frame : seq[int16]
|
||||||
buff = initDeque[tuple[data: seq[int16],flag:int32]](nextPowerOfTwo(windowSize))
|
buff = initDeque[tuple[data: seq[int16],flag:int32]](nextPowerOfTwo(windowSize))
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
# MICROPHONE VAD STREAMING
|
# MICROPHONE VAD STREAMING
|
||||||
Minimalistic example to demonstrate the DeepSpeech streaming API in NIM.Raw audio is streamed from microphone to the DeepSpeech based on VAD (voice Activity Detection).
|
Minimalistic example to demonstrate the Mozilla Voice STT streaming API in NIM. Raw audio is streamed from the microphone to Mozilla Voice STT based on VAD (Voice Activity Detection).
|
||||||
|
|
||||||
## Prerequisites:
|
## Prerequisites:
|
||||||
0) Please read ``PREREQUISITES`` in [README](../README.md) for getting the required ``libdeepspeech.so`` shared library.
|
0) Please read ``PREREQUISITES`` in [README](../README.md) for getting the required ``libmozilla_voice_stt.so`` shared library.
|
||||||
1) This example depends on the ``libportaudio.dll``(precompiled portaudio library).Make sure you have this library in PATH.If you don't have one or are unable to build one ,you can get one from [here](https://gitlab.com/eagledot/nim-portaudio/lib).
|
1) This example depends on ``libportaudio.dll`` (a precompiled portaudio library). Make sure you have this library in PATH. If you don't have one or are unable to build one, you can get one from [here](https://gitlab.com/eagledot/nim-portaudio/lib).
|
||||||
|
|
||||||
2) Download the pre-trained DeepSpeech english model (1089MB):
|
2) Download the pre-trained Mozilla Voice STT English model (1089MB):
|
||||||
|
|
||||||
```
|
```
|
||||||
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.8.0/deepspeech-0.8.0-models.pbmm
|
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.8.0/deepspeech-0.8.0-models.pbmm
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import os ,deques,math,strutils,parseopt,tables
|
import os ,deques,math,strutils,parseopt,tables
|
||||||
import strformat
|
import strformat
|
||||||
import webrtcvad,portaudio,deepspeech,wav
|
import webrtcvad,portaudio,mozilla_voice_stt,wav
|
||||||
|
|
||||||
|
|
||||||
proc sum[T](temp: Deque[T]): int =
|
proc sum[T](temp: Deque[T]): int =
|
||||||
|
@ -47,8 +47,8 @@ let
|
||||||
f1 = open("FIFO_rgb",fmWrite)
|
f1 = open("FIFO_rgb",fmWrite)
|
||||||
f2 = open("FIFO_rgb",fmREAD)
|
f2 = open("FIFO_rgb",fmREAD)
|
||||||
stream: pointer = nil #portaudio Stream pointer holder.
|
stream: pointer = nil #portaudio Stream pointer holder.
|
||||||
modelPtr: ModelState = nil #deepSpeech model
|
modelPtr: ModelState = nil #Mozilla Voice STT model
|
||||||
deepStreamPtr: StreamingState = nil #deepSpeech model stream
|
deepStreamPtr: StreamingState = nil #Mozilla Voice STT model stream
|
||||||
modelPath = args["model"]
|
modelPath = args["model"]
|
||||||
if "scorer" in args:
|
if "scorer" in args:
|
||||||
scorerPath = args["scorer"]
|
scorerPath = args["scorer"]
|
||||||
|
@ -68,7 +68,7 @@ when isMainModule:
|
||||||
codeV = setMode(vad,3'i32)
|
codeV = setMode(vad,3'i32)
|
||||||
assert codeV == 0'i32
|
assert codeV == 0'i32
|
||||||
|
|
||||||
#DeepSpeech model initialization.
|
#Mozilla Voice STT model initialization.
|
||||||
codeD = createModel(modelPath,unsafeaddr(modelPtr))
|
codeD = createModel(modelPath,unsafeaddr(modelPtr))
|
||||||
if codeD == 0'i32:
|
if codeD == 0'i32:
|
||||||
echo("Model Created Successfully")
|
echo("Model Created Successfully")
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
# NodeJS Microphone VAD Streaming
|
# NodeJS Microphone VAD Streaming
|
||||||
|
|
||||||
This is a NodeJS example of recording from the microphone and streaming to
|
This is a NodeJS example of recording from the microphone and streaming to
|
||||||
DeepSpeech with voice activity detection.
|
Mozilla Voice STT with voice activity detection.
|
||||||
|
|
||||||
### Prerequisites:
|
### Prerequisites:
|
||||||
|
|
||||||
1) The example utilized the [mic](https://github.com/ashishbajaj99/mic) NPM module which requires
|
1) The example utilizes the [mic](https://github.com/ashishbajaj99/mic) NPM module, which requires
|
||||||
either [sox](http://sox.sourceforge.net/) (Windows/Mac) or [arecord](http://alsa-project.org/) (Linux).
|
either [sox](http://sox.sourceforge.net/) (Windows/Mac) or [arecord](http://alsa-project.org/) (Linux).
|
||||||
|
|
||||||
2) Download the pre-trained DeepSpeech english model (1089MB):
|
2) Download the pre-trained Mozilla Voice STT English model (1089MB):
|
||||||
|
|
||||||
```
|
```
|
||||||
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.8.0/deepspeech-0.8.0-models.pbmm
|
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.8.0/deepspeech-0.8.0-models.pbmm
|
||||||
|
@ -35,7 +35,7 @@ npm install
|
||||||
node start.js
|
node start.js
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Specify alternate DeepSpeech model path:
|
#### Specify alternate Mozilla Voice STT model path:
|
||||||
|
|
||||||
Use the `DEEPSPEECH_MODEL` environment variable to change models.
|
Use the `DEEPSPEECH_MODEL` environment variable to change models.
|
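For example (path illustrative): `DEEPSPEECH_MODEL=$HOME/models/deepspeech-0.8.0-models node start.js`. As the `createModel()` code later in this diff shows, the value is the model file prefix; `.pbmm` and `.scorer` are appended to it.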
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
"version": "0.1.0",
|
"version": "0.1.0",
|
||||||
"private": true,
|
"private": true,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"deepspeech": "^0.8.0",
|
"@mozilla-voice/stt": "^0.9.0-alpha.5",
|
||||||
"mic": "^2.1.2",
|
"mic": "^2.1.2",
|
||||||
"node-vad": "^1.1.4",
|
"node-vad": "^1.1.4",
|
||||||
"speaker": "^0.5.1",
|
"speaker": "^0.5.1",
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
const DeepSpeech = require('deepspeech');
|
const mozillaVoiceStt = require('@mozilla-voice/stt');
|
||||||
const VAD = require('node-vad');
|
const VAD = require('node-vad');
|
||||||
const mic = require('mic');
|
const mic = require('mic');
|
||||||
const fs = require('fs');
|
const fs = require('fs');
|
||||||
const wav = require('wav');
|
const wav = require('wav');
|
||||||
const Speaker = require('speaker');
|
const Speaker = require('speaker');
|
||||||
|
|
||||||
let DEEPSPEECH_MODEL; // path to deepspeech model directory
|
let DEEPSPEECH_MODEL; // path to model directory
|
||||||
if (process.env.DEEPSPEECH_MODEL) {
|
if (process.env.DEEPSPEECH_MODEL) {
|
||||||
DEEPSPEECH_MODEL = process.env.DEEPSPEECH_MODEL;
|
DEEPSPEECH_MODEL = process.env.DEEPSPEECH_MODEL;
|
||||||
}
|
}
|
||||||
|
@ -24,7 +24,7 @@ const vad = new VAD(VAD_MODE);
|
||||||
function createModel(modelDir) {
|
function createModel(modelDir) {
|
||||||
let modelPath = modelDir + '.pbmm';
|
let modelPath = modelDir + '.pbmm';
|
||||||
let scorerPath = modelDir + '.scorer';
|
let scorerPath = modelDir + '.scorer';
|
||||||
let model = new DeepSpeech.Model(modelPath);
|
let model = new mozillaVoiceStt.Model(modelPath);
|
||||||
model.enableExternalScorer(scorerPath);
|
model.enableExternalScorer(scorerPath);
|
||||||
return model;
|
return model;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# NodeJS voice recognition example using Mozilla DeepSpeech
|
# NodeJS voice recognition example using Mozilla Voice STT
|
||||||
|
|
||||||
Download the pre-trained model (1.8GB):
|
Download the pre-trained model (1.8GB):
|
||||||
|
|
||||||
|
|
|
@@ -1,4 +1,4 @@
-const DeepSpeech = require('deepspeech');
+const mozillaVoiceStt = require('@mozilla-voice/stt');
 const Fs = require('fs');
 const Sox = require('sox-stream');
 const MemoryStream = require('memory-stream');

@@ -7,7 +7,7 @@ const Wav = require('node-wav');
 let modelPath = './models/deepspeech-0.8.0-models.pbmm';

-let model = new DeepSpeech.Model(modelPath);
+let model = new mozillaVoiceStt.Model(modelPath);

 let desiredSampleRate = model.sampleRate();
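The rest of this example is likewise untouched by the rename. For context, a condensed, self-contained sketch of the whole WAV flow under the new package name (file paths are illustrative, and it assumes `Model.stt()` and `sampleRate()` carry over from DeepSpeech unchanged):

```
const mozillaVoiceStt = require('@mozilla-voice/stt');
const Fs = require('fs');
const Sox = require('sox-stream');
const MemoryStream = require('memory-stream');

const model = new mozillaVoiceStt.Model('./models/deepspeech-0.8.0-models.pbmm');
model.enableExternalScorer('./models/deepspeech-0.8.0-models.scorer');

// Transcode the WAV to raw 16-bit mono PCM at the model's sample rate,
// collect it in memory, then run a one-shot transcription.
const audioStream = new MemoryStream();
Fs.createReadStream('./audio/sample.wav')
  .pipe(Sox({
    input: { type: 'wav' },
    output: {
      bits: 16,
      rate: model.sampleRate(),
      channels: 1,
      encoding: 'signed-integer',
      type: 'raw'
    }
  }))
  .pipe(audioStream);

audioStream.on('finish', () => {
  // stt() takes a Buffer of 16-bit little-endian PCM samples.
  console.log('transcript:', model.stt(audioStream.toBuffer()));
});
```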
@@ -1,5 +1,5 @@
 {
-  "name": "deepspeech-nodejs_wav",
+  "name": "mozilla_voice_stt-nodejs_wav",
   "version": "1.0.0",
   "description": "Simple audio processing",
   "main": "index.js",

@@ -8,7 +8,7 @@
   },
   "dependencies": {
     "argparse": "^1.0.10",
-    "deepspeech": "0.8.0",
+    "@mozilla-voice/stt": "0.9.0-alpha.5",
     "node-wav": "0.0.2",
     "sox-stream": "^2.0.3",
     "util": "^0.11.1"
tests.sh

@@ -19,5 +19,5 @@ get_python_wheel_url()
 get_npm_package_url()
 {
-  echo "https://community-tc.services.mozilla.com/api/queue/v1/task/${DEP_TASK_ID}/artifacts/public/mozilla_voice_stt-${DS_VERSION}.tgz"
+  echo "https://community-tc.services.mozilla.com/api/queue/v1/task/${DEP_TASK_ID}/artifacts/public/mozilla-voice-stt-${DS_VERSION}.tgz"
 }
@@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio Version 16
 VisualStudioVersion = 16.0.29519.87
 MinimumVisualStudioVersion = 10.0.40219.1
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechUWP", "DeepSpeechUWP\DeepSpeechUWP.csproj", "{49AAC24D-6A76-4910-913A-94D2D67B6226}"
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MozillaVoiceSttUWP", "MozillaVoiceSttUWP\MozillaVoiceSttUWP.csproj", "{49AAC24D-6A76-4910-913A-94D2D67B6226}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -1,7 +1,7 @@
 <Application
-    x:Class="DeepSpeechUWP.App"
+    x:Class="MozillaVoiceSttUWP.App"
     xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
     xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
-    xmlns:local="using:DeepSpeechUWP">
+    xmlns:local="using:MozillaVoiceSttUWP">

 </Application>
@@ -16,7 +16,7 @@ using Windows.UI.Xaml.Input;
 using Windows.UI.Xaml.Media;
 using Windows.UI.Xaml.Navigation;

-namespace DeepSpeechUWP
+namespace MozillaVoiceSttUWP
 {
     /// <summary>
     /// Provides application-specific behavior to supplement the default Application class.
(Seven binary image assets were renamed with contents unchanged; width, height, and size are identical before and after: 1.4, 7.5, 2.9, 1.6, 1.2, 1.4, and 3.1 KiB.)
@@ -1,8 +1,8 @@
 <Page
-    x:Class="DeepSpeechUWP.MainPage"
+    x:Class="MozillaVoiceSttUWP.MainPage"
     xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
     xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
-    xmlns:local="using:DeepSpeechUWP"
+    xmlns:local="using:MozillaVoiceSttUWP"
     xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
     xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
     mc:Ignorable="d"

@@ -15,7 +15,7 @@
             <RowDefinition Height="40" />
         </Grid.RowDefinitions>
         <StackPanel Grid.Row="0">
-            <TextBlock FontSize="30" FontWeight="Bold">DeepSpeech UWP Demo</TextBlock>
+            <TextBlock FontSize="30" FontWeight="Bold">MozillaVoiceStt UWP Demo</TextBlock>
             <TextBlock FontSize="20" Margin="0 4 0 40">powered by Audio Graph API</TextBlock>
             <TextBlock Margin="0 0 0 10">Select an audio file for transcription:</TextBlock>
             <Grid ColumnSpacing="10">
@@ -1,5 +1,5 @@
-using DeepSpeechClient.Interfaces;
-using DeepSpeechClient.Models;
+using MozillaVoiceSttClient.Interfaces;
+using MozillaVoiceSttClient.Models;
 using System;
 using System.Collections.Concurrent;
 using System.Diagnostics;
@@ -17,7 +17,7 @@ using Windows.Storage;
 using Windows.UI.Xaml;
 using Windows.UI.Xaml.Controls;

-namespace DeepSpeechUWP
+namespace MozillaVoiceSttUWP
 {
     [ComImport]
     [Guid("5B0D3235-4DBA-4D44-865E-8F1D0E4FD04D")]
@@ -34,8 +34,8 @@ namespace DeepSpeechUWP
         private StorageFile audioFile;
         private DeviceInformation selectedInputDevice;
         private DeviceInformationCollection inputDevices;
-        private IDeepSpeech client;
-        private DeepSpeechStream stream;
+        private IMozillaVoiceSttClient client;
+        private MozillaVoiceSttStream stream;
         private MediaEncodingProfile encoding;
         private AudioGraph graph;
@@ -44,7 +44,7 @@
         public MainPage()
         {
             this.InitializeComponent();
-            InitDeepSpeech();
+            InitMozillaVoiceStt();
             ListAudioInputDevices();
             InitAudioGraph();
         }

@@ -61,14 +61,14 @@
             }
         }

-        private void InitDeepSpeech()
+        private void InitMozillaVoiceStt()
         {
             string projectFolder = Directory.GetCurrentDirectory();
             string modelsFolder = Path.Combine(projectFolder, "models");
             string acousticModelPath = Path.Combine(modelsFolder, "deepspeech-0.8.0-models.pbmm");
             string scorerPath = Path.Combine(modelsFolder, "deepspeech-0.8.0-models.scorer");

-            client = new DeepSpeechClient.DeepSpeech(acousticModelPath);
+            client = new MozillaVoiceSttClient.MozillaVoiceSttModel(acousticModelPath);
             client.EnableExternalScorer(scorerPath);
         }
@@ -7,8 +7,8 @@
     <ProjectGuid>{49AAC24D-6A76-4910-913A-94D2D67B6226}</ProjectGuid>
     <OutputType>AppContainerExe</OutputType>
     <AppDesignerFolder>Properties</AppDesignerFolder>
-    <RootNamespace>DeepSpeechUWP</RootNamespace>
-    <AssemblyName>DeepSpeechUWP</AssemblyName>
+    <RootNamespace>MozillaVoiceSttUWP</RootNamespace>
+    <AssemblyName>MozillaVoiceSttUWP</AssemblyName>
     <DefaultLanguage>en-US</DefaultLanguage>
     <TargetPlatformIdentifier>UAP</TargetPlatformIdentifier>
     <TargetPlatformVersion Condition=" '$(TargetPlatformVersion)' == '' ">10.0.18362.0</TargetPlatformVersion>
@@ -158,14 +158,14 @@
     </Page>
   </ItemGroup>
   <ItemGroup>
-    <PackageReference Include="DeepSpeech">
-      <Version>0.8.0</Version>
+    <PackageReference Include="MozillaVoiceSttClient">
+      <Version>0.9.0-alpha.5</Version>
     </PackageReference>
     <PackageReference Include="Microsoft.NETCore.UniversalWindowsPlatform">
       <Version>6.2.9</Version>
     </PackageReference>
-    <Reference Include="DeepSpeechClient, Version=1.0.0.0, Culture=neutral, processorArchitecture=AMD64">
-      <HintPath>..\packages\DeepSpeech.0.8.0\lib\net46\DeepSpeechClient.dll</HintPath>
+    <Reference Include="MozillaVoiceSttClient, Version=1.0.0.0, Culture=neutral, processorArchitecture=AMD64">
+      <HintPath>..\packages\MozillaVoiceSttClient.0.9.0-alpha.5\lib\net46\MozillaVoiceSttClient.dll</HintPath>
     </Reference>
   </ItemGroup>
   <ItemGroup>
@@ -14,7 +14,7 @@
   <mp:PhoneIdentity PhoneProductId="a79d1931-db08-441d-b5ce-1c9cf6b1c8ff" PhonePublisherId="00000000-0000-0000-0000-000000000000"/>

   <Properties>
-    <DisplayName>DeepSpeechUWP</DisplayName>
+    <DisplayName>MozillaVoiceSttUWP</DisplayName>
     <PublisherDisplayName>erikz</PublisherDisplayName>
     <Logo>Assets\StoreLogo.png</Logo>
   </Properties>
@@ -30,12 +30,12 @@
   <Applications>
     <Application Id="App"
       Executable="$targetnametoken$.exe"
-      EntryPoint="DeepSpeechUWP.App">
+      EntryPoint="MozillaVoiceSttUWP.App">
       <uap:VisualElements
-        DisplayName="DeepSpeechUWP"
+        DisplayName="MozillaVoiceSttUWP"
         Square150x150Logo="Assets\Square150x150Logo.png"
         Square44x44Logo="Assets\Square44x44Logo.png"
-        Description="DeepSpeechUWP"
+        Description="MozillaVoiceSttUWP"
         BackgroundColor="transparent">
         <uap:DefaultTile Wide310x150Logo="Assets\Wide310x150Logo.png"/>
         <uap:SplashScreen Image="Assets\SplashScreen.png" />
@@ -5,11 +5,11 @@ using System.Runtime.InteropServices;
 // General Information about an assembly is controlled through the following
 // set of attributes. Change these attribute values to modify the information
 // associated with an assembly.
-[assembly: AssemblyTitle("DeepSpeechUWP")]
+[assembly: AssemblyTitle("MozillaVoiceSttUWP")]
 [assembly: AssemblyDescription("")]
 [assembly: AssemblyConfiguration("")]
 [assembly: AssemblyCompany("")]
-[assembly: AssemblyProduct("DeepSpeechUWP")]
+[assembly: AssemblyProduct("MozillaVoiceSttUWP")]
 [assembly: AssemblyCopyright("Copyright © 2020")]
 [assembly: AssemblyTrademark("")]
 [assembly: AssemblyCulture("")]
@@ -20,7 +20,7 @@ def main(args):
     parser.add_argument('--model', required=True,
                         help='Path to directory that contains all model files (output_graph and scorer)')
     parser.add_argument('--stream', required=False, action='store_true',
-                        help='To use deepspeech streaming interface')
+                        help='To use mozilla voice stt streaming interface')
     args = parser.parse_args()
     if args.stream is True:
         print("Opening mic for streaming")

@@ -52,7 +52,7 @@ def main(args):
     logging.debug("Saving Transcript @: %s" % waveFile.rstrip(".wav") + ".txt")

     for i, segment in enumerate(segments):
-        # Run deepspeech on the chunk that just completed VAD
+        # Run mozilla voice stt on the chunk that just completed VAD
         logging.debug("Processing chunk %002d" % (i,))
         audio = np.frombuffer(segment, dtype=np.int16)
         output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
@@ -93,7 +93,7 @@ class App(QMainWindow):

     def __init__(self):
         super().__init__()
-        self.title = 'Deepspeech Transcriber'
+        self.title = 'Mozilla Voice STT Transcriber'
         self.left = 10
         self.top = 10
         self.width = 480

@@ -114,7 +114,7 @@ class App(QMainWindow):
         self.browseButton = QPushButton('Browse', self)
         self.browseButton.setToolTip('Select a wav file')
         self.modelsButton = QPushButton('Browse', self)
-        self.modelsButton.setToolTip('Select deepspeech models folder')
+        self.modelsButton.setToolTip('Select models folder')
         self.transcribeWav = QPushButton('Transcribe Wav', self)
         self.transcribeWav.setToolTip('Start Wav Transcription')
         self.openMicrophone = QPushButton('Start Speaking', self)
@@ -205,7 +205,7 @@ class App(QMainWindow):
     @pyqtSlot()
     def models_on_click(self):
         logging.debug('Models Browse Button clicked')
-        self.dirName = QFileDialog.getExistingDirectory(self, "Select deepspeech models directory")
+        self.dirName = QFileDialog.getExistingDirectory(self, "Select models directory")
         if self.dirName:
             self.modelsBox.setText(self.dirName)
             logging.debug(self.dirName)
@@ -309,10 +309,10 @@ class App(QMainWindow):
     @param Context: Is a tuple containing three objects
         1. Speech samples, sctx
         2. subprocess handle
-        3. Deepspeech model object
+        3. Mozilla Voice STT model object
     '''
     def micWorker(self, context, progress_callback):
-        # Deepspeech Streaming will be run from this method
+        # Mozilla Voice STT Streaming will be run from this method
         logging.debug("Recording from your microphone")
         while (not self.openMicrophone.isChecked()):
             data = context[1].stdout.read(512)

@@ -343,7 +343,7 @@ class App(QMainWindow):
         self.show()

     def wavWorker(self, waveFile, progress_callback):
-        # Deepspeech will be run from this method
+        # Mozilla Voice STT will be run from this method
         logging.debug("Preparing for transcription...")
         inference_time = 0.0
@@ -353,7 +353,7 @@ class App(QMainWindow):
         logging.debug("Saving Transcript @: %s" % waveFile.rstrip(".wav") + ".txt")

         for i, segment in enumerate(segments):
-            # Run deepspeech on the chunk that just completed VAD
+            # Run mozilla voice stt on the chunk that just completed VAD
             logging.debug("Processing chunk %002d" % (i,))
             audio = np.frombuffer(segment, dtype=np.int16)
             output = wavTranscriber.stt(self.model[0], audio, sample_rate)
@@ -1,3 +1,3 @@
-deepspeech==0.8.0
+mozilla_voice_stt==0.9.0a5
 webrtcvad
 pyqt5
@@ -8,7 +8,7 @@ pushd ${THIS}
 source ../tests.sh

 pip install --user $(get_python_wheel_url "$1")
-pip install --user -r <(grep -v deepspeech requirements.txt)
+pip install --user -r <(grep -v mozilla_voice_stt requirements.txt)

 python audioTranscript_cmd.py \
   --audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \
@@ -11,24 +11,24 @@ Load the pre-trained model into the memory
 @param scorer: Scorer file

 @Retval
-Returns a list [DeepSpeech Object, Model Load Time, Scorer Load Time]
+Returns a list [MozillaVoiceStt Object, Model Load Time, Scorer Load Time]
 '''
 def load_model(models, scorer):
     model_load_start = timer()
-    ds = mozilla_voice_stt.Model(models)
+    mvs = mozilla_voice_stt.Model(models)
     model_load_end = timer() - model_load_start
     logging.debug("Loaded model in %0.3fs." % (model_load_end))

     scorer_load_start = timer()
-    ds.enableExternalScorer(scorer)
+    mvs.enableExternalScorer(scorer)
     scorer_load_end = timer() - scorer_load_start
     logging.debug('Loaded external scorer in %0.3fs.' % (scorer_load_end))

-    return [ds, model_load_end, scorer_load_end]
+    return [mvs, model_load_end, scorer_load_end]

 '''
 Run Inference on input audio file
-@param ds: Deepspeech object
+@param mvs: mozilla voice stt object
 @param audio: Input audio for running inference on
 @param fs: Sample rate of the input audio file

@@ -36,14 +36,14 @@ Run Inference on input audio file
 Returns a list [Inference, Inference Time, Audio Length]

 '''
-def stt(ds, audio, fs):
+def stt(mvs, audio, fs):
     inference_time = 0.0
     audio_length = len(audio) * (1 / fs)

-    # Run Deepspeech
+    # Run mozilla voice stt
     logging.debug('Running inference...')
     inference_start = timer()
-    output = ds.stt(audio)
+    output = mvs.stt(audio)
     inference_end = timer() - inference_start
     inference_time += inference_end
     logging.debug('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length))
@@ -1,7 +1,7 @@
 # Web Microphone Websocket

 This is an example of a ReactJS web application streaming microphone audio from the browser
-to a NodeJS server and transmitting the DeepSpeech results back to the browser.
+to a NodeJS server and transmitting the Mozilla Voice STT results back to the browser.

 #### Download the pre-trained model (1.8GB):
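To make the diffs below easier to follow: the server loads one model at startup, opens a fresh stream per connection, and pushes transcripts back over socket.io. A rough sketch of that wiring under the renamed package (event names and port are illustrative, not the example's exact protocol):

```
const http = require('http');
const socketIO = require('socket.io');
const mozillaVoiceStt = require('@mozilla-voice/stt');

const model = new mozillaVoiceStt.Model('./deepspeech-0.8.0-models.pbmm');
model.enableExternalScorer('./deepspeech-0.8.0-models.scorer');

const server = http.createServer();
const io = socketIO(server);

io.on('connection', (socket) => {
  let sttStream = model.createStream();

  // The browser sends raw 16-bit PCM chunks captured from the microphone.
  socket.on('stream-data', (chunk) => {
    sttStream.feedAudioContent(Buffer.from(chunk));
  });

  // When the client stops recording, decode and return the transcript.
  socket.on('stream-end', () => {
    socket.emit('recognize', { text: sttStream.finishStream() });
    sttStream = model.createStream();
  });
});

server.listen(4000, () => console.log('listening on :4000'));
```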
@@ -8,7 +8,7 @@
     "@testing-library/user-event": "^7.1.2",
     "chai": "^4.2.0",
     "chai-http": "^4.3.0",
-    "deepspeech": "^0.8.0",
+    "@mozilla-voice/stt": "^0.9.0-alpha.5",
     "defaults": "^1.0.3",
     "mocha": "^6.1.4",
     "node-vad": "^1.1.4",
@@ -24,7 +24,7 @@
       work correctly both with client-side routing and a non-root public URL.
       Learn how to configure a non-root public URL by running `npm run build`.
     -->
-    <title>DeepSpeech - Web Microphone Websocket Example</title>
+    <title>Mozilla Voice STT - Web Microphone Websocket Example</title>
   </head>
   <body>
     <noscript>You need to enable JavaScript to run this app.</noscript>
@@ -1,9 +1,9 @@
 const http = require('http');
 const socketIO = require('socket.io');
-const DeepSpeech = require('deepspeech');
+const mozillaVoiceStt = require('@mozilla-voice/stt');
 const VAD = require('node-vad');

-let DEEPSPEECH_MODEL = __dirname + '/deepspeech-0.8.0-models'; // path to deepspeech english model directory
+let DEEPSPEECH_MODEL = __dirname + '/deepspeech-0.8.0-models'; // path to english model directory

 let SILENCE_THRESHOLD = 200; // how many milliseconds of inactivity before processing the audio

@@ -18,7 +18,7 @@ const vad = new VAD(VAD_MODE);
 function createModel(modelDir) {
   let modelPath = modelDir + '.pbmm';
   let scorerPath = modelDir + '.scorer';
-  let model = new DeepSpeech.Model(modelPath);
+  let model = new mozillaVoiceStt.Model(modelPath);
   model.enableExternalScorer(scorerPath);
   return model;
 }
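Downstream of `createModel`, this server gates microphone chunks with node-vad so that only voiced audio reaches the model. A simplified sketch of that pattern (the silence handling in the full example is more elaborate; `model` is the object returned by `createModel` above):

```
const VAD = require('node-vad');
const vad = new VAD(VAD.Mode.NORMAL);

const SILENCE_THRESHOLD = 200; // ms of silence that ends an utterance
let sttStream = model.createStream();
let silenceStart = null;

function processAudioChunk(chunk) {
  // node-vad classifies each 16 kHz PCM chunk as voice or silence.
  vad.processAudio(chunk, 16000).then((event) => {
    if (event === VAD.Event.VOICE) {
      sttStream.feedAudioContent(chunk);
      silenceStart = null;
    } else if (silenceStart === null) {
      silenceStart = Date.now();
    } else if (Date.now() - silenceStart > SILENCE_THRESHOLD) {
      console.log('transcript:', sttStream.finishStream());
      sttStream = model.createStream();
      silenceStart = null;
    }
  });
}
```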