From e9ae38bf4789b9a2f62520c622c1eba1af656a9c Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 15:43:36 +0100 Subject: [PATCH] Update docs --- doc/C-API.rst | 3 + doc/DotNet-API.rst | 19 ++-- doc/NodeJS-API.rst | 12 ++- doc/Python-API.rst | 12 ++- native_client/deepspeech.h | 27 +++--- .../dotnet/DeepSpeechClient/DeepSpeech.cs | 12 +-- .../Interfaces/IDeepSpeech.cs | 12 +-- .../libdeepspeech/DeepSpeechModel.java | 21 ++-- .../CandidateTranscript.java | 96 +++++++++++++++++++ .../libdeepspeech_doc/Metadata.java | 62 +++++------- .../libdeepspeech_doc/TokenMetadata.java | 79 +++++++++++++++ native_client/javascript/index.js | 49 +++++----- native_client/python/__init__.py | 30 +++--- 13 files changed, 314 insertions(+), 120 deletions(-) create mode 100644 native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java create mode 100644 native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java diff --git a/doc/C-API.rst b/doc/C-API.rst index 2506d9b2..2b0e7e05 100644 --- a/doc/C-API.rst +++ b/doc/C-API.rst @@ -34,6 +34,9 @@ C .. doxygenfunction:: DS_IntermediateDecode :project: deepspeech-c +.. doxygenfunction:: DS_IntermediateDecodeWithMetadata + :project: deepspeech-c + .. doxygenfunction:: DS_FinishStream :project: deepspeech-c diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst index 2ba3415f..d43c7afb 100644 --- a/doc/DotNet-API.rst +++ b/doc/DotNet-API.rst @@ -31,13 +31,20 @@ ErrorCodes Metadata -------- -.. doxygenstruct:: DeepSpeechClient::Structs::Metadata +.. doxygenstruct:: DeepSpeechClient::Models::Metadata :project: deepspeech-dotnet - :members: items, num_items, confidence + :members: Transcripts -MetadataItem ------------- +CandidateTranscript +------------------- -.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem +.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript :project: deepspeech-dotnet - :members: character, timestep, start_time + :members: Tokens, Confidence + +TokenMetadata +------------- + +.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata + :project: deepspeech-dotnet + :members: Text, Timestep, StartTime diff --git a/doc/NodeJS-API.rst b/doc/NodeJS-API.rst index aaba718c..b6170b5b 100644 --- a/doc/NodeJS-API.rst +++ b/doc/NodeJS-API.rst @@ -30,8 +30,14 @@ Metadata .. js:autoclass:: Metadata :members: -MetadataItem ------------- +CandidateTranscript +------------------- -.. js:autoclass:: MetadataItem +.. js:autoclass:: CandidateTranscript + :members: + +TokenMetadata +------------- + +.. js:autoclass:: TokenMetadata :members: diff --git a/doc/Python-API.rst b/doc/Python-API.rst index b2b3567f..9aec57f0 100644 --- a/doc/Python-API.rst +++ b/doc/Python-API.rst @@ -21,8 +21,14 @@ Metadata .. autoclass:: Metadata :members: -MetadataItem ------------- +CandidateTranscript +------------------- -.. autoclass:: MetadataItem +.. autoclass:: CandidateTranscript + :members: + +TokenMetadata +------------- + +.. autoclass:: TokenMetadata :members: diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h index 8bfee073..bf4c0f00 100644 --- a/native_client/deepspeech.h +++ b/native_client/deepspeech.h @@ -42,20 +42,20 @@ typedef struct CandidateTranscript { TokenMetadata* tokens; /** Size of the tokens array */ int num_tokens; - /** Approximated confidence value for this transcription. This is roughly the + /** Approximated confidence value for this transcript. This is roughly the * sum of the acoustic model logit values for each timestep/character that - * contributed to the creation of this transcription. + * contributed to the creation of this transcript. */ double confidence; } CandidateTranscript; /** - * @brief An array of CandidateTranscript objects computed by the model + * @brief An array of CandidateTranscript objects computed by the model. */ typedef struct Metadata { /** Array of CandidateTranscript objects */ CandidateTranscript* transcripts; - /** Size of the transcriptions array */ + /** Size of the transcripts array */ int num_transcripts; } Metadata; @@ -191,14 +191,14 @@ char* DS_SpeechToText(ModelState* aCtx, unsigned int aBufferSize); /** - * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata - * about the results. + * @brief Use the DeepSpeech model to perform Speech-To-Text and output results + * including metadata. * * @param aCtx The ModelState pointer for the model to use. * @param aBuffer A 16-bit, mono raw audio signal at the appropriate * sample rate (matching what the model was trained on). * @param aBufferSize The number of samples in the audio signal. - * @param aNumResults The number of candidate transcripts to return. + * @param aNumResults The maximum number of candidate transcripts to return. Returned value might be smaller than this. * * @return Metadata struct containing multiple candidate transcripts. Each transcript * has per-token metadata including timing information. The user is @@ -252,7 +252,7 @@ char* DS_IntermediateDecode(const StreamingState* aSctx); /** * @brief Compute the intermediate decoding of an ongoing streaming inference, - * returns per-letter metadata. + * return results including metadata. * * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. * @param aNumResults The number of candidate transcripts to return. @@ -267,8 +267,8 @@ Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx, unsigned int aNumResults); /** - * @brief Signal the end of an audio signal to an ongoing streaming - * inference, returns the STT result over the whole audio signal. + * @brief Compute the final decoding of an ongoing streaming inference and return + * the result. Signals the end of an ongoing streaming inference. * * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. * @@ -281,8 +281,9 @@ DEEPSPEECH_EXPORT char* DS_FinishStream(StreamingState* aSctx); /** - * @brief Signal the end of an audio signal to an ongoing streaming - * inference, returns per-letter metadata. + * @brief Compute the final decoding of an ongoing streaming inference and return + * results including metadata. Signals the end of an ongoing streaming + * inference. * * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. * @param aNumResults The number of candidate transcripts to return. @@ -295,7 +296,7 @@ char* DS_FinishStream(StreamingState* aSctx); * @note This method will free the state pointer (@p aSctx). */ DEEPSPEECH_EXPORT -Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx, +Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx, unsigned int aNumResults); /** diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index ce184cf4..3340c9b3 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -199,10 +199,10 @@ namespace DeepSpeechClient } /// - /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. + /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata. /// /// Instance of the stream to finish. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults) { @@ -220,10 +220,10 @@ namespace DeepSpeechClient } /// - /// Computes the intermediate decoding of an ongoing streaming inference. + /// Computes the intermediate decoding of an ongoing streaming inference, including metadata. /// /// Instance of the stream to decode. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The STT intermediate result. public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults) { @@ -273,11 +273,11 @@ namespace DeepSpeechClient } /// - /// Use the DeepSpeech model to perform Speech-To-Text. + /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata. /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata. Returns NULL on error. public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults) { diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs index ae3e72cf..37d6ce59 100644 --- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs @@ -68,11 +68,11 @@ namespace DeepSpeechClient.Interfaces uint aBufferSize); /// - /// Use the DeepSpeech model to perform Speech-To-Text. + /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata. /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata. Returns NULL on error. unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, @@ -105,10 +105,10 @@ namespace DeepSpeechClient.Interfaces unsafe string IntermediateDecode(DeepSpeechStream stream); /// - /// Computes the intermediate decoding of an ongoing streaming inference. + /// Computes the intermediate decoding of an ongoing streaming inference, including metadata. /// /// Instance of the stream to decode. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults); @@ -120,10 +120,10 @@ namespace DeepSpeechClient.Interfaces unsafe string FinishStream(DeepSpeechStream stream); /// - /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. + /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata. /// /// Instance of the stream to finish. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults); } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java index b506b1d3..a5b339b3 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java @@ -117,9 +117,10 @@ public class DeepSpeechModel { * @param buffer A 16-bit, mono raw audio signal at the appropriate * sample rate (matching what the model was trained on). * @param buffer_size The number of samples in the audio signal. - * @param num_results Number of candidate transcripts to return. + * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this. * - * @return Outputs a Metadata object of individual letters along with their timing information. + * @return Metadata struct containing multiple candidate transcripts. Each transcript + * has per-token metadata including timing information. */ public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) { return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results); @@ -165,7 +166,7 @@ public class DeepSpeechModel { * @brief Compute the intermediate decoding of an ongoing streaming inference. * * @param ctx A streaming state pointer returned by createStream(). - * @param num_results Number of candidate transcripts to return. + * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this. * * @return The STT intermediate result. */ @@ -174,8 +175,8 @@ public class DeepSpeechModel { } /** - * @brief Signal the end of an audio signal to an ongoing streaming - * inference, returns the STT result over the whole audio signal. + * @brief Compute the final decoding of an ongoing streaming inference and return + * the result. Signals the end of an ongoing streaming inference. * * @param ctx A streaming state pointer returned by createStream(). * @@ -188,13 +189,15 @@ public class DeepSpeechModel { } /** - * @brief Signal the end of an audio signal to an ongoing streaming - * inference, returns per-letter metadata. + * @brief Compute the final decoding of an ongoing streaming inference and return + * the results including metadata. Signals the end of an ongoing streaming + * inference. * * @param ctx A streaming state pointer returned by createStream(). - * @param num_results Number of candidate transcripts to return. + * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this. * - * @return Outputs a Metadata object of individual letters along with their timing information. + * @return Metadata struct containing multiple candidate transcripts. Each transcript + * has per-token metadata including timing information. * * @note This method will free the state pointer (@p ctx). */ diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java new file mode 100644 index 00000000..c02b39ad --- /dev/null +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java @@ -0,0 +1,96 @@ +/* ---------------------------------------------------------------------------- + * This file was automatically generated by SWIG (http://www.swig.org). + * Version 4.0.1 + * + * Do not make changes to this file unless you know what you are doing--modify + * the SWIG interface file instead. + * ----------------------------------------------------------------------------- */ + +package org.mozilla.deepspeech.libdeepspeech; + +/** + * A single transcript computed by the model, including a confidence value and + * the metadata for its constituent tokens. + */ +public class CandidateTranscript { + private transient long swigCPtr; + protected transient boolean swigCMemOwn; + + protected CandidateTranscript(long cPtr, boolean cMemoryOwn) { + swigCMemOwn = cMemoryOwn; + swigCPtr = cPtr; + } + + protected static long getCPtr(CandidateTranscript obj) { + return (obj == null) ? 0 : obj.swigCPtr; + } + + public synchronized void delete() { + if (swigCPtr != 0) { + if (swigCMemOwn) { + swigCMemOwn = false; + throw new UnsupportedOperationException("C++ destructor does not have public access"); + } + swigCPtr = 0; + } + } + + /** + * Array of TokenMetadata objects + */ + public void setTokens(TokenMetadata value) { + implJNI.CandidateTranscript_tokens_set(swigCPtr, this, TokenMetadata.getCPtr(value), value); + } + + /** + * Array of TokenMetadata objects + */ + public TokenMetadata getTokens() { + long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this); + return (cPtr == 0) ? null : new TokenMetadata(cPtr, false); + } + + /** + * Size of the tokens array + */ + public void setNum_tokens(int value) { + implJNI.CandidateTranscript_num_tokens_set(swigCPtr, this, value); + } + + /** + * Size of the tokens array + */ + public int getNum_tokens() { + return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this); + } + + /** + * Approximated confidence value for this transcript. This is roughly the + * sum of the acoustic model logit values for each timestep/character that + * contributed to the creation of this transcript. + */ + public void setConfidence(double value) { + implJNI.CandidateTranscript_confidence_set(swigCPtr, this, value); + } + + /** + * Approximated confidence value for this transcript. This is roughly the + * sum of the acoustic model logit values for each timestep/character that + * contributed to the creation of this transcript. + */ + public double getConfidence() { + return implJNI.CandidateTranscript_confidence_get(swigCPtr, this); + } + + /** + * Retrieve one TokenMetadata element + * + * @param i Array index of the TokenMetadata to get + * + * @return The TokenMetadata requested or null + */ + public TokenMetadata getToken(int i) { + return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), true); + } + +} diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java index 482b7c58..bb9b0773 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 4.0.2 + * Version 4.0.1 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. @@ -9,7 +9,7 @@ package org.mozilla.deepspeech.libdeepspeech; /** - * Stores the entire CTC output as an array of character metadata objects + * An array of CandidateTranscript objects computed by the model. */ public class Metadata { private transient long swigCPtr; @@ -40,61 +40,43 @@ public class Metadata { } /** - * List of items + * Array of CandidateTranscript objects */ - public void setItems(MetadataItem value) { - implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value); + public void setTranscripts(CandidateTranscript value) { + implJNI.Metadata_transcripts_set(swigCPtr, this, CandidateTranscript.getCPtr(value), value); } /** - * List of items + * Array of CandidateTranscript objects */ - public MetadataItem getItems() { - long cPtr = implJNI.Metadata_items_get(swigCPtr, this); - return (cPtr == 0) ? null : new MetadataItem(cPtr, false); + public CandidateTranscript getTranscripts() { + long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this); + return (cPtr == 0) ? null : new CandidateTranscript(cPtr, false); } /** - * Size of the list of items + * Size of the transcripts array */ - public void setNum_items(int value) { - implJNI.Metadata_num_items_set(swigCPtr, this, value); + public void setNum_transcripts(int value) { + implJNI.Metadata_num_transcripts_set(swigCPtr, this, value); } /** - * Size of the list of items + * Size of the transcripts array */ - public int getNum_items() { - return implJNI.Metadata_num_items_get(swigCPtr, this); + public int getNum_transcripts() { + return implJNI.Metadata_num_transcripts_get(swigCPtr, this); } /** - * Approximated confidence value for this transcription. This is roughly the
- * sum of the acoustic model logit values for each timestep/character that
- * contributed to the creation of this transcription. + * Retrieve one CandidateTranscript element + * + * @param i Array index of the CandidateTranscript to get + * + * @return The CandidateTranscript requested or null */ - public void setConfidence(double value) { - implJNI.Metadata_confidence_set(swigCPtr, this, value); - } - - /** - * Approximated confidence value for this transcription. This is roughly the
- * sum of the acoustic model logit values for each timestep/character that
- * contributed to the creation of this transcription. - */ - public double getConfidence() { - return implJNI.Metadata_confidence_get(swigCPtr, this); - } - - /** - * Retrieve one MetadataItem element
- *
- * @param i Array index of the MetadataItem to get
- *
- * @return The MetadataItem requested or null - */ - public MetadataItem getItem(int i) { - return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true); + public CandidateTranscript getTranscript(int i) { + return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), true); } } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java new file mode 100644 index 00000000..32246f1a --- /dev/null +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------------- + * This file was automatically generated by SWIG (http://www.swig.org). + * Version 4.0.1 + * + * Do not make changes to this file unless you know what you are doing--modify + * the SWIG interface file instead. + * ----------------------------------------------------------------------------- */ + +package org.mozilla.deepspeech.libdeepspeech; + +/** + * Stores text of an individual token, along with its timing information + */ +public class TokenMetadata { + private transient long swigCPtr; + protected transient boolean swigCMemOwn; + + protected TokenMetadata(long cPtr, boolean cMemoryOwn) { + swigCMemOwn = cMemoryOwn; + swigCPtr = cPtr; + } + + protected static long getCPtr(TokenMetadata obj) { + return (obj == null) ? 0 : obj.swigCPtr; + } + + public synchronized void delete() { + if (swigCPtr != 0) { + if (swigCMemOwn) { + swigCMemOwn = false; + throw new UnsupportedOperationException("C++ destructor does not have public access"); + } + swigCPtr = 0; + } + } + + /** + * The text corresponding to this token + */ + public void setText(String value) { + implJNI.TokenMetadata_text_set(swigCPtr, this, value); + } + + /** + * The text corresponding to this token + */ + public String getText() { + return implJNI.TokenMetadata_text_get(swigCPtr, this); + } + + /** + * Position of the token in units of 20ms + */ + public void setTimestep(int value) { + implJNI.TokenMetadata_timestep_set(swigCPtr, this, value); + } + + /** + * Position of the token in units of 20ms + */ + public int getTimestep() { + return implJNI.TokenMetadata_timestep_get(swigCPtr, this); + } + + /** + * Position of the token in seconds + */ + public void setStart_time(float value) { + implJNI.TokenMetadata_start_time_set(swigCPtr, this, value); + } + + /** + * Position of the token in seconds + */ + public float getStart_time() { + return implJNI.TokenMetadata_start_time_get(swigCPtr, this); + } + +} diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js index 7a027bde..6ce06c0d 100644 --- a/native_client/javascript/index.js +++ b/native_client/javascript/index.js @@ -115,12 +115,12 @@ Model.prototype.stt = function(aBuffer) { } /** - * Use the DeepSpeech model to perform Speech-To-Text and output metadata - * about the results. + * Use the DeepSpeech model to perform Speech-To-Text and output results including metadata. * * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). + * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified. * - * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error. + * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error. */ Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) { aNumResults = aNumResults || 1; @@ -173,9 +173,11 @@ Stream.prototype.intermediateDecode = function() { } /** - * Compute the intermediate decoding of an ongoing streaming inference. + * Compute the intermediate decoding of an ongoing streaming inference, return results including metadata. * - * @return {string} The STT intermediate result. + * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified. + * + * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error. */ Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) { aNumResults = aNumResults || 1; @@ -183,7 +185,7 @@ Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) { } /** - * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal. + * Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference. * * @return {string} The STT result. * @@ -196,7 +198,9 @@ Stream.prototype.finishStream = function() { } /** - * Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata. + * Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference. + * + * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified. * * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. * @@ -253,48 +257,49 @@ function Version() { /** * @class * - * Stores each individual character, along with its timing information + * Stores text of an individual token, along with its timing information */ function TokenMetadata() {} /** - * The character generated for transcription + * The text corresponding to this token * - * @return {string} The character generated + * @return {string} The text generated */ TokenMetadata.prototype.text = function() {} /** - * Position of the character in units of 20ms + * Position of the token in units of 20ms * - * @return {int} The position of the character + * @return {int} The position of the token */ TokenMetadata.prototype.timestep = function() {}; /** - * Position of the character in seconds + * Position of the token in seconds * - * @return {float} The position of the character + * @return {float} The position of the token */ TokenMetadata.prototype.start_time = function() {}; /** * @class * - * Stores the entire CTC output as an array of character metadata objects + * A single transcript computed by the model, including a confidence value and + * the metadata for its constituent tokens. */ function CandidateTranscript () {} /** - * List of items + * Array of tokens * - * @return {array} List of :js:func:`TokenMetadata` + * @return {array} Array of :js:func:`TokenMetadata` */ -CandidateTranscript.prototype.items = function() {} +CandidateTranscript.prototype.tokens = function() {} /** * Approximated confidence value for this transcription. This is roughly the - * sum of the acoustic model logit values for each timestep/character that + * sum of the acoustic model logit values for each timestep/token that * contributed to the creation of this transcription. * * @return {float} Confidence value @@ -304,14 +309,14 @@ CandidateTranscript.prototype.confidence = function() {} /** * @class * - * Stores the entire CTC output as an array of character metadata objects + * An array of CandidateTranscript objects computed by the model. */ function Metadata () {} /** - * List of items + * Array of transcripts * - * @return {array} List of :js:func:`CandidateTranscript` objects + * @return {array} Array of :js:func:`CandidateTranscript` objects */ Metadata.prototype.transcripts = function() {} diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py index 5d9072ec..a44cf05f 100644 --- a/native_client/python/__init__.py +++ b/native_client/python/__init__.py @@ -123,15 +123,15 @@ class Model(object): def sttWithMetadata(self, audio_buffer, num_results=1): """ - Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results. + Use the DeepSpeech model to perform Speech-To-Text and return results including metadata. :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). :type audio_buffer: numpy.int16 array - :param num_results: Number of candidate transcripts to return. + :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this. :type num_results: int - :return: Outputs a struct of individual letters along with their timing information. + :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. :type: :func:`Metadata` """ return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results) @@ -192,10 +192,13 @@ class Stream(object): def intermediateDecodeWithMetadata(self, num_results=1): """ - Compute the intermediate decoding of an ongoing streaming inference. + Compute the intermediate decoding of an ongoing streaming inference and return results including metadata. - :return: The STT intermediate result. - :type: str + :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this. + :type num_results: int + + :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. + :type: :func:`Metadata` :throws: RuntimeError if the stream object is not valid """ @@ -205,8 +208,9 @@ class Stream(object): def finishStream(self): """ - Signal the end of an audio signal to an ongoing streaming inference, - returns the STT result over the whole audio signal. + Compute the final decoding of an ongoing streaming inference and return + the result. Signals the end of an ongoing streaming inference. The underlying + stream object must not be used after this method is called. :return: The STT result. :type: str @@ -221,13 +225,15 @@ class Stream(object): def finishStreamWithMetadata(self, num_results=1): """ - Signal the end of an audio signal to an ongoing streaming inference, - returns per-letter metadata. + Compute the final decoding of an ongoing streaming inference and return + results including metadata. Signals the end of an ongoing streaming + inference. The underlying stream object must not be used after this + method is called. - :param num_results: Number of candidate transcripts to return. + :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this. :type num_results: int - :return: Outputs a struct of individual letters along with their timing information. + :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. :type: :func:`Metadata` :throws: RuntimeError if the stream object is not valid