WIP: Adding DeepSpeech w/ TFLite
This commit is contained in:
Родитель
ca9be29bac
Коммит
77665e35b7
|
@ -9,6 +9,9 @@ android {
|
|||
versionCode 1
|
||||
versionName "1.0"
|
||||
testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
|
||||
ndk {
|
||||
abiFilters 'armeabi-v7a', 'arm64-v8a', 'x86_64'
|
||||
}
|
||||
}
|
||||
buildTypes {
|
||||
release {
|
||||
|
@ -20,6 +23,10 @@ android {
|
|||
sourceCompatibility JavaVersion.VERSION_1_8
|
||||
targetCompatibility JavaVersion.VERSION_1_8
|
||||
}
|
||||
|
||||
lintOptions {
|
||||
abortOnError false
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
|
@ -28,6 +35,7 @@ dependencies {
|
|||
implementation 'com.android.support:appcompat-v7:27.1.1'
|
||||
implementation 'com.android.support.constraint:constraint-layout:1.1.2'
|
||||
implementation 'com.jjoe64:graphview:4.2.2'
|
||||
implementation 'net.lingala.zip4j:zip4j:1.3.2'
|
||||
testImplementation 'junit:junit:4.12'
|
||||
androidTestImplementation 'com.android.support.test:runner:1.0.2'
|
||||
androidTestImplementation 'com.android.support.test.espresso:espresso-core:3.0.2'
|
||||
|
|
|
@ -1,32 +1,58 @@
|
|||
package com.mozilla.speechapp;
|
||||
|
||||
import android.Manifest;
|
||||
|
||||
import android.app.Activity;
|
||||
import android.app.DownloadManager;
|
||||
|
||||
import android.content.pm.PackageManager;
|
||||
import android.content.BroadcastReceiver;
|
||||
import android.content.Context;
|
||||
import android.content.Intent;
|
||||
import android.content.IntentFilter;
|
||||
|
||||
import android.database.Cursor;
|
||||
|
||||
import android.support.annotation.NonNull;
|
||||
import android.support.v4.app.ActivityCompat;
|
||||
import android.support.v7.app.AppCompatActivity;
|
||||
|
||||
import android.net.Uri;
|
||||
|
||||
import android.os.Bundle;
|
||||
import android.os.AsyncTask;
|
||||
|
||||
import android.util.Log;
|
||||
|
||||
import android.view.View;
|
||||
import android.view.WindowManager;
|
||||
|
||||
import android.widget.Button;
|
||||
import android.widget.CompoundButton;
|
||||
import android.widget.EditText;
|
||||
import android.widget.Switch;
|
||||
import android.widget.Toast;
|
||||
|
||||
import com.jjoe64.graphview.GraphView;
|
||||
import com.jjoe64.graphview.series.DataPoint;
|
||||
import com.jjoe64.graphview.series.LineGraphSeries;
|
||||
|
||||
import com.mozilla.speechlibrary.ISpeechRecognitionListener;
|
||||
import com.mozilla.speechlibrary.MozillaSpeechService;
|
||||
import com.mozilla.speechlibrary.STTResult;
|
||||
import com.mozilla.speechmodule.R;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import net.lingala.zip4j.core.ZipFile;
|
||||
|
||||
import static android.support.constraint.Constraints.TAG;
|
||||
|
||||
public class MainActivity extends AppCompatActivity implements ISpeechRecognitionListener, CompoundButton.OnCheckedChangeListener {
|
||||
|
||||
private static long sDownloadId;
|
||||
private static DownloadManager sDownloadManager;
|
||||
|
||||
private MozillaSpeechService mMozillaSpeechService;
|
||||
private GraphView mGraph;
|
||||
private long mDtstart;
|
||||
|
@ -47,6 +73,7 @@ public class MainActivity extends AppCompatActivity implements ISpeechRecognitio
|
|||
EditText txtProdutTag, txtLanguage;
|
||||
Switch switchTranscriptions = findViewById(R.id.switchTranscriptions);
|
||||
Switch switchSamples = findViewById(R.id.switchSamples);
|
||||
Switch useDeepSpeech = findViewById(R.id.useDeepSpeech);
|
||||
|
||||
if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO)
|
||||
!= PackageManager.PERMISSION_GRANTED) {
|
||||
|
@ -75,7 +102,12 @@ public class MainActivity extends AppCompatActivity implements ISpeechRecognitio
|
|||
mSeries1.resetData(new DataPoint[0]);
|
||||
mMozillaSpeechService.setLanguage(txtLanguage.getText().toString());
|
||||
mMozillaSpeechService.setProductTag(txtProdutTag.getText().toString());
|
||||
mMozillaSpeechService.start(getApplicationContext());
|
||||
mMozillaSpeechService.setModelPath(getExternalFilesDir("models").getAbsolutePath());
|
||||
if (mMozillaSpeechService.ensureModelInstalled()) {
|
||||
mMozillaSpeechService.start(getApplicationContext());
|
||||
} else {
|
||||
maybeDownloadOrExtractModel(getExternalFilesDir("models").getAbsolutePath(), mMozillaSpeechService.getLanguageDir());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
Log.d(TAG, e.getLocalizedMessage());
|
||||
e.printStackTrace();
|
||||
|
@ -93,8 +125,10 @@ public class MainActivity extends AppCompatActivity implements ISpeechRecognitio
|
|||
|
||||
switchTranscriptions.setOnCheckedChangeListener(this);
|
||||
switchSamples.setOnCheckedChangeListener(this);
|
||||
useDeepSpeech.setOnCheckedChangeListener(this);
|
||||
switchTranscriptions.toggle();
|
||||
switchSamples.toggle();
|
||||
useDeepSpeech.toggle();
|
||||
|
||||
mGraph = findViewById(R.id.graph);
|
||||
mSeries1 = new LineGraphSeries<>(new DataPoint[0]);
|
||||
|
@ -154,8 +188,89 @@ public class MainActivity extends AppCompatActivity implements ISpeechRecognitio
|
|||
public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
|
||||
if (buttonView.equals(findViewById(R.id.switchTranscriptions))) {
|
||||
mMozillaSpeechService.storeTranscriptions(isChecked);
|
||||
} else {
|
||||
} else if (buttonView.equals(findViewById(R.id.switchSamples))) {
|
||||
mMozillaSpeechService.storeSamples(isChecked);
|
||||
} else if (buttonView.equals(findViewById(R.id.useDeepSpeech))) {
|
||||
mMozillaSpeechService.useDeepSpeech(isChecked);
|
||||
}
|
||||
}
|
||||
|
||||
private class AsyncUnzip extends AsyncTask<String, Void, Boolean> {
|
||||
|
||||
@Override
|
||||
protected void onPreExecute() {
|
||||
Toast noModel = Toast.makeText(getApplicationContext(), "Extracting downloaded model", Toast.LENGTH_LONG);
|
||||
mPlain_text_input.append("Extracting downloaded model\n");
|
||||
noModel.show();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Boolean doInBackground(String...params) {
|
||||
String aZipFile = params[0], aRootModelsPath = params[1];
|
||||
try {
|
||||
ZipFile zf = new ZipFile(aZipFile);
|
||||
zf.extractAll(aRootModelsPath);
|
||||
} catch (Exception e) {
|
||||
Log.d(TAG, e.getLocalizedMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
return (new File(aZipFile)).delete();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void onPostExecute(Boolean result) {
|
||||
Button buttonStart = findViewById(R.id.button_start), buttonCancel = findViewById(R.id.button_cancel);
|
||||
mMozillaSpeechService.start(getApplicationContext());
|
||||
buttonStart.setEnabled(true);
|
||||
buttonCancel.setEnabled(true);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void maybeDownloadOrExtractModel(String aModelsPath, String aLang) {
|
||||
String zipFile = aModelsPath + "/" + aLang + ".zip";
|
||||
Uri modelZipURL = Uri.parse(mMozillaSpeechService.getModelDownloadURL());
|
||||
Uri modelZipFile = Uri.parse("file://" + zipFile);
|
||||
|
||||
Button buttonStart = findViewById(R.id.button_start), buttonCancel = findViewById(R.id.button_cancel);
|
||||
buttonStart.setEnabled(false);
|
||||
buttonCancel.setEnabled(false);
|
||||
|
||||
BroadcastReceiver receiver = new BroadcastReceiver() {
|
||||
@Override
|
||||
public void onReceive(Context context, Intent intent) {
|
||||
String action = intent.getAction();
|
||||
if (DownloadManager.ACTION_DOWNLOAD_COMPLETE.equals(action)) {
|
||||
long downloadId = intent.getLongExtra(DownloadManager.EXTRA_DOWNLOAD_ID, 0);
|
||||
DownloadManager.Query query = new DownloadManager.Query();
|
||||
query.setFilterById(downloadId);
|
||||
Cursor c = sDownloadManager.query(query);
|
||||
if (c.moveToFirst()) {
|
||||
int columnIndex = c.getColumnIndex(DownloadManager.COLUMN_STATUS);
|
||||
if (DownloadManager.STATUS_SUCCESSFUL == c.getInt(columnIndex)) {
|
||||
Log.d(TAG, "Download successfull");
|
||||
|
||||
new AsyncUnzip().execute(zipFile, aModelsPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Toast noModel = Toast.makeText(getApplicationContext(), "No model has been found for language '" + aLang + "'. Triggering download ...", Toast.LENGTH_LONG);
|
||||
mPlain_text_input.append("No model has been found for language '" + aLang + "'. Triggering download ...\n");
|
||||
noModel.show();
|
||||
|
||||
sDownloadManager = (DownloadManager) getSystemService(Context.DOWNLOAD_SERVICE);
|
||||
DownloadManager.Request request = new DownloadManager.Request(modelZipURL);
|
||||
request.setTitle("DeepSpeech " + aLang);
|
||||
request.setDescription("DeepSpeech Model");
|
||||
request.setNotificationVisibility(DownloadManager.Request.VISIBILITY_VISIBLE_NOTIFY_COMPLETED);
|
||||
request.setVisibleInDownloadsUi(false);
|
||||
request.setDestinationUri(modelZipFile);
|
||||
sDownloadId = sDownloadManager.enqueue(request);
|
||||
|
||||
getApplicationContext().registerReceiver(receiver, new IntentFilter(DownloadManager.ACTION_DOWNLOAD_COMPLETE));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -49,12 +49,21 @@
|
|||
android:inputType="textMultiLine"
|
||||
android:singleLine="false" />
|
||||
|
||||
<Switch
|
||||
android:id="@+id/useDeepSpeech"
|
||||
android:layout_width="wrap_content"
|
||||
android:layout_height="wrap_content"
|
||||
android:layout_alignParentEnd="true"
|
||||
android:layout_below="@+id/switchTranscriptions"
|
||||
android:text="Use DeepSpeech" />
|
||||
|
||||
<Switch
|
||||
android:id="@+id/switchTranscriptions"
|
||||
android:layout_width="wrap_content"
|
||||
android:layout_height="wrap_content"
|
||||
android:layout_alignParentEnd="true"
|
||||
android:layout_centerVertical="true"
|
||||
android:layout_below="@+id/switchSamples"
|
||||
android:text="Store Transcriptions" />
|
||||
|
||||
<Switch
|
||||
|
@ -71,7 +80,7 @@
|
|||
android:layout_height="wrap_content"
|
||||
android:layout_above="@+id/plain_text_input"
|
||||
android:layout_alignStart="@+id/graph"
|
||||
android:layout_marginBottom="-71dp"
|
||||
android:layout_marginBottom="-50dp"
|
||||
android:ems="10"
|
||||
android:inputType="textPersonName"
|
||||
android:text="ProductTag" />
|
||||
|
@ -82,9 +91,10 @@
|
|||
android:layout_height="wrap_content"
|
||||
android:layout_above="@+id/plain_text_input"
|
||||
android:layout_alignStart="@+id/graph"
|
||||
android:layout_marginBottom="-120dp"
|
||||
android:layout_marginBottom="-80dp"
|
||||
android:ems="10"
|
||||
android:inputType="textPersonName"
|
||||
android:text="Language" />
|
||||
android:text="eng" />
|
||||
|
||||
</RelativeLayout>
|
||||
</android.support.constraint.ConstraintLayout>
|
||||
</android.support.constraint.ConstraintLayout>
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
apply plugin: 'com.android.library'
|
||||
apply from: 'maven-push.gradle'
|
||||
|
||||
def versionMajor = 1
|
||||
def versionMajor = 2
|
||||
def versionMinor = 0
|
||||
def versionPatch = 4
|
||||
def versionPatch = 0
|
||||
|
||||
android {
|
||||
compileSdkVersion 25
|
||||
|
@ -13,6 +13,10 @@ android {
|
|||
versionCode versionMajor * 10000 + versionMinor * 100 + versionPatch
|
||||
versionName "${versionMajor}.${versionMinor}.${versionPatch}"
|
||||
testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
|
||||
|
||||
ndk {
|
||||
abiFilters 'armeabi-v7a', 'arm64-v8a', 'x86_64'
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
|
@ -27,15 +31,21 @@ android {
|
|||
path 'src/main/cpp/Android.mk'
|
||||
}
|
||||
}
|
||||
|
||||
compileOptions {
|
||||
sourceCompatibility = 1.7
|
||||
targetCompatibility = 1.7
|
||||
}
|
||||
|
||||
lintOptions {
|
||||
abortOnError false
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation fileTree(include: ['*.jar'], dir: 'libs')
|
||||
implementation 'com.loopj.android:android-async-http:1.4.9'
|
||||
implementation 'org.mozilla.deepspeech:libdeepspeech:0.5.0-alpha.1@aar'
|
||||
testImplementation 'junit:junit:4.12'
|
||||
androidTestImplementation 'com.android.support.test:runner:1.0.2'
|
||||
androidTestImplementation 'com.android.support.test.espresso:espresso-core:3.0.2'
|
||||
|
|
|
@ -0,0 +1,341 @@
|
|||
package com.mozilla.speechlibrary;
|
||||
|
||||
import android.content.Context;
import android.media.AudioRecord;
import android.os.Process;
import android.util.Log;

import com.github.axet.audiolibrary.encoders.Sound;

import java.io.File;
import java.io.FileOutputStream;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.ShortBuffer;
import java.nio.channels.FileChannel;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
import org.mozilla.deepspeech.libdeepspeech.DeepSpeechStreamingState;
|
||||
|
||||
class LocalDSInference implements Runnable {
|
||||
|
||||
DeepSpeechModel mModel;
|
||||
DeepSpeechStreamingState mStreamingState;
|
||||
MozillaSpeechService mService;
|
||||
|
||||
Queue<short[]> mBuffers = new ConcurrentLinkedQueue<short[]>();
|
||||
|
||||
boolean stopStream;
|
||||
|
||||
final int N_CEP = 26;
|
||||
final int N_CONTEXT = 9;
|
||||
final int BEAM_WIDTH = 250;
|
||||
|
||||
final float LM_WEIGHT = 0.75f;
|
||||
final float VALID_WORD_COUNT_WEIGHT = 1.85f;
|
||||
|
||||
static final String _tag = "LocalDSInference";
|
||||
|
||||
static boolean keepClips = false;
|
||||
static boolean useDecoder = false;
|
||||
static int clipNumber = 0;
|
||||
FileChannel clipDebug;
|
||||
|
||||
String modelRoot;
|
||||
String tfliteModel;
|
||||
String alphabet;
|
||||
String LM;
|
||||
String trie;
|
||||
|
||||
protected LocalDSInference(MozillaSpeechService aService, int aFrameSize, int aSampleRate) {
|
||||
Log.e(this._tag, "new LocalDSInference()");
|
||||
|
||||
modelRoot = aService.getModelPath() + "/" + aService.getLanguageDir();
|
||||
|
||||
Log.e(this._tag, "Loading model from " + modelRoot);
|
||||
|
||||
this.tfliteModel = this.modelRoot + "/" + LocalSpeechRecognition.kTfLiteModel;
|
||||
this.alphabet = this.modelRoot + "/" + LocalSpeechRecognition.kAlphabet;
|
||||
this.LM = this.modelRoot + "/" + LocalSpeechRecognition.kLM;
|
||||
this.trie = this.modelRoot + "/" + LocalSpeechRecognition.kTrie;
|
||||
|
||||
this.clipNumber += 1;
|
||||
|
||||
this.keepClips = (new File(this.modelRoot + "/.keepClips")).exists();
|
||||
this.useDecoder = (new File(this.modelRoot + "/.useDecoder")).exists();
|
||||
|
||||
Log.e(this._tag, "keepClips=" + this.keepClips);
|
||||
Log.e(this._tag, "useDecoder=" + this.useDecoder);
|
||||
|
||||
this.mService = aService;
|
||||
|
||||
if (this.mModel == null) {
|
||||
Log.e(this._tag, "new DeepSpeechModel(\"" + this.tfliteModel + "\")");
|
||||
this.mModel = new DeepSpeechModel(this.tfliteModel, N_CEP, N_CONTEXT, this.alphabet, BEAM_WIDTH);
|
||||
}
|
||||
|
||||
if (this.useDecoder) {
|
||||
this.mModel.enableDecoderWihLM(this.alphabet, this.LM, this.trie, LM_WEIGHT, VALID_WORD_COUNT_WEIGHT);
|
||||
}
|
||||
|
||||
if (this.keepClips) {
|
||||
try {
|
||||
this.clipDebug = new FileOutputStream(this.modelRoot + "/clip_" + this.clipNumber + ".wav").getChannel();
|
||||
} catch (Exception ex) {
|
||||
}
|
||||
}
|
||||
|
||||
this.mStreamingState = this.mModel.setupStream(aFrameSize * 2, aSampleRate);
|
||||
this.stopStream = false;
|
||||
}
|
||||
|
||||
public void closeModel() {
|
||||
Log.e(this._tag, "closeModel()");
|
||||
|
||||
if (this.mStreamingState != null) {
|
||||
String _ = this.mModel.finishStream(this.mStreamingState);
|
||||
}
|
||||
|
||||
if (this.mModel != null) {
|
||||
Log.e(this._tag, "closeModel()");
|
||||
this.mModel.destroyModel();
|
||||
}
|
||||
|
||||
this.mStreamingState = null;
|
||||
this.mModel = null;
|
||||
}
|
||||
|
||||
public void appendAudio(short[] aBuffer) {
|
||||
Log.e(this._tag, "appendAudio()");
|
||||
if (!this.stopStream) {
|
||||
// Log.e(this._tag, "appendAudio()::add");
|
||||
this.mBuffers.add(aBuffer);
|
||||
|
||||
if (this.keepClips) {
|
||||
// DEBUG
|
||||
ByteBuffer myByteBuffer = ByteBuffer.allocate(aBuffer.length * 2);
|
||||
myByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
ShortBuffer myShortBuffer = myByteBuffer.asShortBuffer();
|
||||
myShortBuffer.put(aBuffer);
|
||||
|
||||
try {
|
||||
this.clipDebug.write(myByteBuffer);
|
||||
} catch (Exception ex) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void endOfStream() {
|
||||
Log.e(this._tag, "endOfStream()");
|
||||
this.stopStream = true;
|
||||
if (this.keepClips) {
|
||||
try {
|
||||
this.clipDebug.close();
|
||||
} catch (Exception ex) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void run() {
|
||||
Log.e(this._tag, "run()");
|
||||
|
||||
while ((!this.stopStream) || (this.mBuffers.size() > 0)) {
|
||||
short[] aBuffer = this.mBuffers.poll();
|
||||
|
||||
if (aBuffer == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
this.mModel.feedAudioContent(this.mStreamingState, aBuffer, aBuffer.length);
|
||||
}
|
||||
|
||||
Log.e(this._tag, "finishStream()");
|
||||
mService.notifyListeners(MozillaSpeechService.SpeechState.DECODING, null);
|
||||
String finalDecoded = this.mModel.finishStream(this.mStreamingState);
|
||||
Log.e(this._tag, "finalDecoded(" + finalDecoded.length() + ")=" + finalDecoded);
|
||||
this.mStreamingState = null;
|
||||
|
||||
STTResult sttResult = new STTResult(finalDecoded, (float)(1.0));
|
||||
mService.notifyListeners(MozillaSpeechService.SpeechState.STT_RESULT, sttResult);
|
||||
}
|
||||
}
|
||||
|
||||
class LocalSpeechRecognition implements Runnable {
|
||||
|
||||
Vad mVad;
|
||||
boolean done;
|
||||
boolean cancelled;
|
||||
|
||||
int mMinimumVoice = 150;
|
||||
int mMaximumSilence = 500;
|
||||
int mUpperLimit = 10;
|
||||
|
||||
static final int FRAME_SIZE = 80;
|
||||
|
||||
int mSampleRate;
|
||||
int mChannels;
|
||||
MozillaSpeechService mService;
|
||||
|
||||
static final String _tag = "LocalSpeechRecognition";
|
||||
|
||||
LocalDSInference mInferer;
|
||||
Thread mInferenceThread;
|
||||
|
||||
public static String kTfLiteModel = "output_graph.tflite";
|
||||
public static String kAlphabet = "alphabet.txt";
|
||||
public static String kLM = "lm.binary";
|
||||
public static String kTrie = "trie";
|
||||
|
||||
private static Map<String,String> mLanguages = new HashMap<String, String>();
|
||||
static {
|
||||
mLanguages.put("en-US", "eng");
|
||||
mLanguages.put("fr-FR", "fra");
|
||||
}
|
||||
|
||||
private static String kBaseModelURL = "https://github.com/lissyx/DeepSpeech/releases/download/android-test/";
|
||||
|
||||
protected LocalSpeechRecognition(int aSampleRate, int aChannels, Vad aVad,
|
||||
MozillaSpeechService aService) {
|
||||
Log.e(this._tag, "new LocalSpeechRecognition()");
|
||||
this.mVad = aVad;
|
||||
this.mSampleRate = aSampleRate;
|
||||
this.mChannels = aChannels;
|
||||
this.mService = aService;
|
||||
|
||||
this.mInferer = new LocalDSInference(this.mService, FRAME_SIZE, this.mSampleRate);
|
||||
this.mInferenceThread = new Thread(this.mInferer);
|
||||
this.mInferenceThread.start();
|
||||
}
|
||||
|
||||
public void run() {
|
||||
try {
|
||||
int vad = 0;
|
||||
|
||||
boolean finishedvoice = false;
|
||||
boolean touchedvoice = false;
|
||||
boolean touchedsilence = false;
|
||||
boolean raisenovoice = false;
|
||||
long samplesvoice = 0 ;
|
||||
long samplessilence = 0 ;
|
||||
long dtantes = System.currentTimeMillis();
|
||||
long dtantesmili = System.currentTimeMillis();
|
||||
|
||||
Process.setThreadPriority(Process.THREAD_PRIORITY_URGENT_AUDIO);
|
||||
AudioRecord recorder = Sound.getAudioRecord(mChannels, mSampleRate);
|
||||
recorder.startRecording();
|
||||
mService.notifyListeners(MozillaSpeechService.SpeechState.START_LISTEN, null);
|
||||
|
||||
while (!this.done && !this.cancelled) {
|
||||
int nshorts = 0 ;
|
||||
|
||||
short[] mBuftemp = new short[FRAME_SIZE * mChannels * 2];
|
||||
nshorts = recorder.read(mBuftemp, 0, mBuftemp.length);
|
||||
|
||||
vad = mVad.feed(mBuftemp, nshorts);
|
||||
double[] fft = Sound.fft(mBuftemp, 0, nshorts);
|
||||
double fftsum = Arrays.stream(fft).sum()/fft.length;
|
||||
mService.notifyListeners(MozillaSpeechService.SpeechState.MIC_ACTIVITY, fftsum);
|
||||
|
||||
long dtdepois = System.currentTimeMillis();
|
||||
|
||||
if (vad == 0) {
|
||||
if (touchedvoice) {
|
||||
samplessilence += dtdepois - dtantesmili;
|
||||
if (samplessilence > mMaximumSilence) touchedsilence = true;
|
||||
}
|
||||
} else { // vad == 1 => Active voice
|
||||
samplesvoice += dtdepois - dtantesmili;
|
||||
if (samplesvoice > mMinimumVoice) touchedvoice = true;
|
||||
|
||||
for (int i = 0; i < mBuftemp.length; ++i) {
|
||||
mBuftemp[i] *= 5.0;
|
||||
}
|
||||
}
|
||||
dtantesmili = dtdepois;
|
||||
|
||||
this.mInferer.appendAudio(mBuftemp);
|
||||
|
||||
if (touchedvoice && touchedsilence)
|
||||
finishedvoice = true;
|
||||
|
||||
if (finishedvoice) {
|
||||
this.done = true;
|
||||
this.mInferer.endOfStream();
|
||||
}
|
||||
|
||||
if ((dtdepois - dtantes)/1000 > mUpperLimit ) {
|
||||
this.done = true;
|
||||
if (touchedvoice) {
|
||||
this.mInferer.endOfStream();
|
||||
}
|
||||
else {
|
||||
raisenovoice = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (nshorts <= 0)
|
||||
break;
|
||||
}
|
||||
|
||||
mVad.stop();
|
||||
recorder.stop();
|
||||
recorder.release();
|
||||
|
||||
if (raisenovoice) mService.notifyListeners(MozillaSpeechService.SpeechState.NO_VOICE, null);
|
||||
|
||||
if (cancelled) {
|
||||
cancelled = false;
|
||||
mService.notifyListeners(MozillaSpeechService.SpeechState.CANCELED, null);
|
||||
return;
|
||||
}
|
||||
|
||||
} catch (Exception exc) {
|
||||
String error = String.format("General audio error %s", exc.getMessage());
|
||||
mService.notifyListeners(MozillaSpeechService.SpeechState.ERROR, error);
|
||||
exc.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public void cancel() {
|
||||
Log.e(this._tag, "cancel()");
|
||||
this.cancelled = true;
|
||||
this.done = true;
|
||||
|
||||
if (this.mInferer != null) {
|
||||
this.mInferer.closeModel();
|
||||
}
|
||||
}
|
||||
|
||||
public static String getLanguageDir(String aLanguage) {
|
||||
String rv = aLanguage;
|
||||
|
||||
if (rv.length() != 3) {
|
||||
if (mLanguages.containsKey(rv)) {
|
||||
rv = mLanguages.get(rv);
|
||||
}
|
||||
}
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
public static boolean ensureModelInstalled(String aModelPath) {
|
||||
Log.e(_tag, "ensureModelInstalled(" + aModelPath + ")");
|
||||
return (new File(aModelPath + "/" + kTfLiteModel)).exists()
|
||||
&& (new File(aModelPath + "/" + kAlphabet)).exists()
|
||||
&& (new File(aModelPath + "/" + kLM)).exists()
|
||||
&& (new File(aModelPath + "/" + kTrie)).exists();
|
||||
}
|
||||
|
||||
public static String getModelDownloadURL(String aLang) {
|
||||
return kBaseModelURL + aLang + ".zip";
|
||||
}
|
||||
}
|
|
@ -15,6 +15,8 @@ public class MozillaSpeechService {
|
|||
private Context mContext;
|
||||
private boolean isIdle = true;
|
||||
NetworkSettings mNetworkSettings;
|
||||
private boolean useDeepSpeech = false;
|
||||
private String mModelPath;
|
||||
|
||||
public enum SpeechState
|
||||
{
|
||||
|
@ -23,7 +25,8 @@ public class MozillaSpeechService {
|
|||
}
|
||||
|
||||
private static final MozillaSpeechService ourInstance = new MozillaSpeechService();
|
||||
private SpeechRecognition mSpeechRecognition;
|
||||
private NetworkSpeechRecognition mNetworkSpeechRecognition;
|
||||
private LocalSpeechRecognition mLocalSpeechRecognition;
|
||||
private SpeechState mState;
|
||||
private Vad mVad;
|
||||
|
||||
|
@ -47,15 +50,23 @@ public class MozillaSpeechService {
|
|||
if (retVal < 0) {
|
||||
notifyListeners(SpeechState.ERROR, "Error Initializing VAD " + String.valueOf(retVal));
|
||||
} else {
|
||||
this.mSpeechRecognition = new SpeechRecognition(SAMPLERATE, CHANNELS, mVad, aContext,
|
||||
this, mNetworkSettings);
|
||||
Thread audio_thread = new Thread(this.mSpeechRecognition);
|
||||
Thread audio_thread;
|
||||
|
||||
if (this.useDeepSpeech) {
|
||||
this.mLocalSpeechRecognition = new LocalSpeechRecognition(SAMPLERATE, CHANNELS, mVad, this);
|
||||
audio_thread = new Thread(this.mLocalSpeechRecognition);
|
||||
} else {
|
||||
this.mNetworkSpeechRecognition = new NetworkSpeechRecognition(SAMPLERATE, CHANNELS, mVad, aContext, this, mNetworkSettings);
|
||||
audio_thread = new Thread(this.mNetworkSpeechRecognition);
|
||||
}
|
||||
|
||||
audio_thread.start();
|
||||
isIdle = false;
|
||||
}
|
||||
}
|
||||
} catch (Exception exc) {
|
||||
notifyListeners(SpeechState.ERROR, "General error loading the module.");
|
||||
Log.e("MozillaSpeechService", "General error loading the module: " + exc);
|
||||
notifyListeners(SpeechState.ERROR, "General error loading the module: " + exc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -78,7 +89,10 @@ public class MozillaSpeechService {
|
|||
}
|
||||
|
||||
public void cancel() {
|
||||
this.mSpeechRecognition.cancel();
|
||||
// this.mNetworkSpeechRecognition.cancel();
|
||||
if (this.mLocalSpeechRecognition != null) {
|
||||
this.mLocalSpeechRecognition.cancel();
|
||||
}
|
||||
}
|
||||
|
||||
public void removeListener(ISpeechRecognitionListener aListener) {
|
||||
|
@ -99,6 +113,31 @@ public class MozillaSpeechService {
|
|||
this.mNetworkSettings.mLanguage = language;
|
||||
}
|
||||
|
||||
public String getLanguageDir() {
|
||||
return LocalSpeechRecognition.getLanguageDir(this.mNetworkSettings.mLanguage);
|
||||
}
|
||||
|
||||
public void useDeepSpeech(boolean yesOrNo) {
|
||||
this.useDeepSpeech = yesOrNo;
|
||||
}
|
||||
|
||||
public String getModelPath() {
|
||||
return this.mModelPath;
|
||||
}
|
||||
|
||||
// This sets model's root path, not including the language
|
||||
public void setModelPath(String aModelPath) {
|
||||
this.mModelPath = aModelPath;
|
||||
}
|
||||
|
||||
public boolean ensureModelInstalled() {
|
||||
return LocalSpeechRecognition.ensureModelInstalled(this.getModelPath() + "/" + this.getLanguageDir());
|
||||
}
|
||||
|
||||
public String getModelDownloadURL() {
|
||||
return LocalSpeechRecognition.getModelDownloadURL(this.getLanguageDir());
|
||||
}
|
||||
|
||||
public void setProductTag(String tag) {
|
||||
this.mNetworkSettings.mProductTag = tag;
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@ import com.github.axet.audiolibrary.encoders.Sound;
|
|||
import java.io.ByteArrayOutputStream;
|
||||
import java.util.Arrays;
|
||||
|
||||
class SpeechRecognition implements Runnable {
|
||||
class NetworkSpeechRecognition implements Runnable {
|
||||
|
||||
Vad mVad;
|
||||
short[] mBuftemp;
|
||||
|
@ -29,7 +29,7 @@ class SpeechRecognition implements Runnable {
|
|||
Networking network;
|
||||
NetworkSettings mNetworkSettings;
|
||||
|
||||
protected SpeechRecognition(int aSampleRate, int aChannels, Vad aVad, Context aContext,
|
||||
protected NetworkSpeechRecognition(int aSampleRate, int aChannels, Vad aVad, Context aContext,
|
||||
MozillaSpeechService aService, NetworkSettings mNetworkSettings) {
|
||||
this.mVad = aVad;
|
||||
this.mContext = aContext;
|
Загрузка…
Ссылка в новой задаче