Mirror of https://github.com/mozilla/DeepSpeech.git

Address review comments and update docs

Parent: efbed73d5c
Commit: 1d3b3a31a1
@@ -1,4 +1 @@
-*.binary filter=lfs diff=lfs merge=lfs -crlf
-data/lm/trie filter=lfs diff=lfs merge=lfs -crlf
-data/lm/vocab.txt filter=lfs diff=lfs merge=lfs -text
+data/lm/kenlm.scorer filter=lfs diff=lfs merge=lfs -text
@@ -36,7 +36,7 @@ To install and use deepspeech all you have to do is:
    tar xvf audio-0.6.1.tar.gz

    # Transcribe an audio file
-   deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --lm deepspeech-0.6.1-models/lm.binary --trie deepspeech-0.6.1-models/trie --audio audio/2830-3980-0043.wav
+   deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --scorer deepspeech-0.6.1-models/kenlm.scorer --audio audio/2830-3980-0043.wav

 A pre-trained English model is available for use and can be downloaded using `the instructions below <doc/USING.rst#using-a-pre-trained-model>`_. A package with some example audio files is available for download in our `release notes <https://github.com/mozilla/DeepSpeech/releases/latest>`_.
@@ -52,7 +52,7 @@ Quicker inference can be performed using a supported NVIDIA GPU on Linux. See th
    pip3 install deepspeech-gpu

    # Transcribe an audio file.
-   deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --lm deepspeech-0.6.1-models/lm.binary --trie deepspeech-0.6.1-models/trie --audio audio/2830-3980-0043.wav
+   deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --scorer deepspeech-0.6.1-models/kenlm.scorer --audio audio/2830-3980-0043.wav

 Please ensure you have the required `CUDA dependencies <doc/USING.rst#cuda-dependency>`_.
@@ -5,9 +5,7 @@ This directory contains language-specific data files. Most importantly, you will

 1. A list of unique characters for the target language (e.g. English) in `data/alphabet.txt`

-2. A binary n-gram language model compiled by `kenlm` in `data/lm/lm.binary`
-
-3. A trie model compiled by `generate_trie <https://github.com/mozilla/DeepSpeech#using-the-command-line-client>`_ in `data/lm/trie`
+2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`, which includes a binary n-gram language model generated with `data/lm/generate_lm.py`.

 For more information on how to build these resources from scratch, see `data/lm/README.md`
@@ -1,8 +1,8 @@

-lm.binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
+The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).

-The trie was then generated from the vocabulary of the language model:
+The scorer package was then built using the `generate_package.py` script:

 .. code-block:: bash

-   ./generate_trie ../data/alphabet.txt lm.binary trie
+   python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer
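For orientation, a minimal sketch of the two-step build this README hunk describes, driven from Python. It assumes both scripts are run from `data/lm` and that KenLM's `lmplz`, `build_binary` and `filter` binaries are on PATH, as the text above requires; the arguments are exactly the ones documented in the diff.

.. code-block:: python

   import subprocess

   # Step 1: build lm.binary (and librispeech-vocab-500k.txt) from the
   # LibriSpeech normalized LM training text.
   subprocess.run(['python', 'generate_lm.py'], check=True)

   # Step 2: bundle alphabet, LM, vocabulary and default hyperparameters
   # into the kenlm.scorer package.
   subprocess.run([
       'python', 'generate_package.py',
       '--alphabet', '../alphabet.txt',
       '--lm', 'lm.binary',
       '--vocab', 'librispeech-vocab-500k.txt',
       '--default_alpha', '0.75',
       '--default_beta', '1.85',
       '--package', 'kenlm.scorer',
   ], check=True)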
@@ -39,10 +39,13 @@ def main():
         '--prune', '0', '0', '1'
     ])

-    # Filter LM using vocabulary of top 500k words
-    filtered_path = os.path.join(tmp, 'lm_filtered.arpa')
     vocab_str = '\n'.join(word for word, count in counter.most_common(500000))
+    with open('librispeech-vocab-500k.txt', 'w') as fout:
+        fout.write(vocab_str)
+
+    # Filter LM using vocabulary of top 500k words
+    print('Filtering ARPA file...')
+    filtered_path = os.path.join(tmp, 'lm_filtered.arpa')
     subprocess.run(['filter', 'single', 'model:{}'.format(lm_path), filtered_path], input=vocab_str.encode('utf-8'), check=True)

     # Quantize and produce trie binary.
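The hunk above reorders the filtering step so the top-500k vocabulary is persisted to disk before KenLM's `filter` tool consumes it. A self-contained sketch of that idea follows; the training-text file name and the word count are illustrative.

.. code-block:: python

   # Keep the N most frequent words, then have KenLM's `filter` binary drop
   # every n-gram containing any other word from the ARPA model.
   from collections import Counter
   import subprocess

   N = 500000
   counter = Counter()
   with open('librispeech-lm-norm.txt') as fin:  # assumed training text
       for line in fin:
           counter.update(line.split())

   vocab_str = '\n'.join(word for word, count in counter.most_common(N))
   with open('librispeech-vocab-500k.txt', 'w') as fout:
       fout.write(vocab_str)

   subprocess.run(['filter', 'single', 'model:lm.arpa', 'lm_filtered.arpa'],
                  input=vocab_str.encode('utf-8'), check=True)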
@@ -41,6 +41,7 @@ def create_bundle(

     if force_utf8 != None: # pylint: disable=singleton-comparison
         use_utf8 = force_utf8.value
+        print("Forcing UTF-8 mode = {}".format(use_utf8))
     else:
         use_utf8 = vocab_looks_char_based
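The added `print` sits inside a tri-state decision: an explicit `force_utf8` overrides the vocabulary heuristic. A sketch of that logic in isolation; the `.value`-carrying wrapper mirrors the diff, the function name is otherwise illustrative.

.. code-block:: python

   def resolve_utf8_mode(force_utf8, vocab_looks_char_based):
       # Explicit override: force_utf8 carries a boolean in .value.
       if force_utf8 is not None:
           use_utf8 = force_utf8.value
           print("Forcing UTF-8 mode = {}".format(use_utf8))
       else:
           # No override: fall back to the heuristic over the vocabulary.
           use_utf8 = vocab_looks_char_based
       return use_utf8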
@@ -7,7 +7,7 @@ Creating a model instance and loading model
 .. literalinclude:: ../native_client/client.cc
    :language: c
    :linenos:
-   :lines: 370-388
+   :lines: 370-390

 Performing inference
 --------------------
@@ -7,6 +7,12 @@ Model
 .. js:autoclass:: Model
    :members:

+Stream
+------
+
+.. js:autoclass:: Stream
+   :members:
+
 Module exported methods
 -----------------------
@@ -7,7 +7,7 @@ Creating a model instance and loading model
 .. literalinclude:: ../native_client/javascript/client.js
    :language: javascript
    :linenos:
-   :lines: 57-66
+   :lines: 54-72

 Performing inference
 --------------------

@@ -15,7 +15,7 @@ Performing inference
 .. literalinclude:: ../native_client/javascript/client.js
    :language: javascript
    :linenos:
-   :lines: 115-117
+   :lines: 117-121

 Full source code
 ----------------
@@ -9,6 +9,12 @@ Model
 .. autoclass:: Model
    :members:

+Stream
+------
+
+.. autoclass:: Stream
+   :members:
+
 Metadata
 --------
@@ -7,7 +7,7 @@ Creating a model instance and loading model
 .. literalinclude:: ../native_client/python/client.py
    :language: python
    :linenos:
-   :lines: 69, 78
+   :lines: 111, 120

 Performing inference
 --------------------

@@ -15,7 +15,7 @@ Performing inference
 .. literalinclude:: ../native_client/python/client.py
    :language: python
    :linenos:
-   :lines: 95-98
+   :lines: 140-145

 Full source code
 ----------------
@@ -106,9 +106,9 @@ Note: the following command assumes you `downloaded the pre-trained model <#gett

 .. code-block:: bash

-   deepspeech --model models/output_graph.pbmm --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
+   deepspeech --model models/output_graph.pbmm --scorer models/kenlm.scorer --audio my_audio_file.wav

-The arguments ``--lm`` and ``--trie`` are optional, and represent a language model.
+The ``--scorer`` argument is optional, and represents an external language model to be used when transcribing the audio.

 See :github:`client.py <native_client/python/client.py>` for an example of how to use the package programatically.

@@ -162,7 +162,7 @@ Note: the following command assumes you `downloaded the pre-trained model <#gett

 .. code-block:: bash

-   ./deepspeech --model models/output_graph.pbmm --lm models/lm.binary --trie models/trie --audio audio_input.wav
+   ./deepspeech --model models/output_graph.pbmm --scorer models/kenlm.scorer --audio audio_input.wav

 See the help output with ``./deepspeech -h`` and the :github:`native client README <native_client/README.rst>` for more details.
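The same transcription can be scripted. A minimal sketch invoking the updated CLI from Python, using the paths from the docs above:

.. code-block:: python

   # Run the deepspeech CLI with the new --scorer flag and print the transcript.
   import subprocess

   result = subprocess.run(
       ['deepspeech',
        '--model', 'models/output_graph.pbmm',
        '--scorer', 'models/kenlm.scorer',
        '--audio', 'my_audio_file.wav'],
       capture_output=True, text=True, check=True)
   print(result.stdout)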
@@ -59,11 +59,11 @@ void PrintHelp(const char* bin)

 bool ProcessArgs(int argc, char** argv)
 {
-    const char* const short_opts = "m:a:s:r:w:c:d:b:tehv";
+    const char* const short_opts = "m:l:a:b:c:d:tejs:vh";
     const option long_opts[] = {
         {"model", required_argument, nullptr, 'm'},
+        {"scorer", required_argument, nullptr, 'l'},
-        {"audio", required_argument, nullptr, 'w'},
+        {"audio", required_argument, nullptr, 'a'},
         {"beam_width", required_argument, nullptr, 'b'},
         {"lm_alpha", required_argument, nullptr, 'c'},
         {"lm_beta", required_argument, nullptr, 'd'},

@@ -71,8 +71,8 @@ bool ProcessArgs(int argc, char** argv)
         {"extended", no_argument, nullptr, 'e'},
         {"json", no_argument, nullptr, 'j'},
         {"stream", required_argument, nullptr, 's'},
-        {"help", no_argument, nullptr, 'h'},
         {"version", no_argument, nullptr, 'v'},
+        {"help", no_argument, nullptr, 'h'},
         {nullptr, no_argument, nullptr, 0}
     };

@@ -93,14 +93,14 @@ bool ProcessArgs(int argc, char** argv)
             scorer = optarg;
             break;

-        case 'w':
+        case 'a':
             audio = optarg;
             break;

         case 'b':
             beam_width = atoi(optarg);
             break;

         case 'c':
             set_alphabeta = true;
             lm_alpha = atof(optarg);

@@ -115,10 +115,6 @@ bool ProcessArgs(int argc, char** argv)
             show_times = true;
             break;

-        case 'v':
-            has_versions = true;
-            break;
-
         case 'e':
             extended_metadata = true;
             break;

@@ -131,6 +127,10 @@ bool ProcessArgs(int argc, char** argv)
             stream_size = atoi(optarg);
             break;

+        case 'v':
+            has_versions = true;
+            break;
+
         case 'h': // -h or --help
         case '?': // Unrecognized option
         default:
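Python's `getopt` module uses the same short-option syntax as the C `getopt` driving this parser (a trailing ``:`` marks an option that takes an argument), so the new option string can be sanity-checked in isolation; the sample argv below is illustrative.

.. code-block:: python

   import getopt

   # Parse a sample argv against the new short-option string from args.h.
   opts, args = getopt.getopt(
       ['-m', 'output_graph.pbmm', '-l', 'kenlm.scorer', '-a', 'audio.wav'],
       'm:l:a:b:c:d:tejs:vh')
   print(opts)
   # [('-m', 'output_graph.pbmm'), ('-l', 'kenlm.scorer'), ('-a', 'audio.wav')]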
@@ -12,11 +12,11 @@ class Scorer(swigwrapper.Scorer):
     :type alpha: float
     :param beta: Word insertion bonus.
     :type beta: float
-    :model_path: Path to load scorer.
+    :scorer_path: Path to load scorer from.
     :alphabet: Alphabet
-    :type model_path: basestring
+    :type scorer_path: basestring
     """
-    def __init__(self, alpha=None, beta=None, model_path=None, alphabet=None):
+    def __init__(self, alpha=None, beta=None, scorer_path=None, alphabet=None):
         super(Scorer, self).__init__()
         # Allow bare initialization
         if alphabet:

@@ -26,7 +26,7 @@ class Scorer(swigwrapper.Scorer):
         if err != 0:
             raise ValueError("Error when deserializing alphabet.")

-        err = self.init(model_path.encode('utf-8'),
+        err = self.init(scorer_path.encode('utf-8'),
                         native_alphabet)
         if err != 0:
             raise ValueError("Scorer initialization failed with error code {}".format(err), err)
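Based on the renamed parameter above, constructing a decoder `Scorer` would look roughly like this. The import path and the `Alphabet` constructor are assumptions not shown in this diff; alpha/beta reuse the defaults documented for `generate_package.py`.

.. code-block:: python

   from ds_ctcdecoder import Alphabet, Scorer  # assumed import location

   alphabet = Alphabet('alphabet.txt')         # assumed constructor
   scorer = Scorer(alpha=0.75, beta=1.85,
                   scorer_path='kenlm.scorer', alphabet=alphabet)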
@@ -36,7 +36,7 @@ DecoderState::init(const Alphabet& alphabet,
   prefix_root_.reset(root);
   prefixes_.push_back(root);

-  if (ext_scorer != nullptr && (bool)ext_scorer_->dictionary) {
+  if (ext_scorer != nullptr && (bool)(ext_scorer_->dictionary)) {
     // no need for std::make_shared<>() since Copy() does 'new' behind the doors
     auto dict_ptr = std::shared_ptr<PathTrie::FstType>(ext_scorer->dictionary->Copy(true));
     root->set_dictionary(dict_ptr);
@@ -51,7 +51,7 @@ Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including:


 * ``output_graph.tflite`` which is the TF Lite model
-* ``lm.binary`` and ``trie`` files, if you want to use the language model ; please
+* ``kenlm.scorer``, if you want to use the language model ; please
   be aware that too big language model will make the device run out of memory

 Then, push binaries from ``native_client.tar.xz`` to ``/data/local/tmp/ds``\ :
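Scripted form of the push step described above; a sketch assuming ``adb`` is on PATH, a device is connected, and the files sit in the current directory.

.. code-block:: python

   import subprocess

   # Push the TF Lite model and the scorer package to the device.
   for f in ('output_graph.tflite', 'kenlm.scorer'):
       subprocess.run(['adb', 'push', f, '/sdcard/deepspeech/'], check=True)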
@@ -123,6 +123,11 @@ Model.prototype.createStream = function() {
     return ctx;
 }

+/**
+ * @class
+ * Provides an interface to a DeepSpeech stream. The constructor cannot be called
+ * directly, use :js:func:`Model.createStream`.
+ */
 function Stream(nativeStream) {
     this._impl = nativeStream;
 }
@@ -131,6 +131,10 @@ class Model(object):


 class Stream(object):
+    """
+    Class wrapping a DeepSpeech stream. The constructor cannot be called directly.
+    Use :func:`Model.createStream()`
+    """
     def __init__(self, native_stream):
         self._impl = native_stream
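Hypothetical usage of the new `Stream` wrapper: a stream is obtained from `Model.createStream()` and fed 16-bit PCM audio. The constructor arguments and method names below follow the DeepSpeech streaming API of this era and are assumptions, not part of this diff.

.. code-block:: python

   import numpy as np
   from deepspeech import Model

   ds = Model('output_graph.pbmm')        # constructor signature assumed
   stream = ds.createStream()             # returns a Stream instance
   stream.feedAudioContent(np.zeros(320, dtype=np.int16))  # placeholder chunk
   print(stream.finishStream())           # final transcript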
@@ -34,7 +34,7 @@ then:
     DEEPSPEECH_AUDIO: "https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz"
     PIP_DEFAULT_TIMEOUT: "60"
     EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples"
-    EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05"
+    EXAMPLES_CHECKOUT_TARGET: "4b97ac41d03ca0d23fa92526433db72a90f47d4a"

   command:
     - "/bin/bash"

@@ -44,7 +44,7 @@ payload:
     MSYS: 'winsymlinks:nativestrict'
     TENSORFLOW_BUILD_ARTIFACT: ${build.tensorflow}
     EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples"
-    EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05"
+    EXAMPLES_CHECKOUT_TARGET: "4b97ac41d03ca0d23fa92526433db72a90f47d4a"

   command:
     - >-