diff --git a/doc/C-API.rst b/doc/C-API.rst
index 2506d9b2..2b0e7e05 100644
--- a/doc/C-API.rst
+++ b/doc/C-API.rst
@@ -34,6 +34,9 @@ C
.. doxygenfunction:: DS_IntermediateDecode
:project: deepspeech-c
+.. doxygenfunction:: DS_IntermediateDecodeWithMetadata
+ :project: deepspeech-c
+
.. doxygenfunction:: DS_FinishStream
:project: deepspeech-c
diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst
index 2ba3415f..d43c7afb 100644
--- a/doc/DotNet-API.rst
+++ b/doc/DotNet-API.rst
@@ -31,13 +31,20 @@ ErrorCodes
Metadata
--------
-.. doxygenstruct:: DeepSpeechClient::Structs::Metadata
+.. doxygenstruct:: DeepSpeechClient::Models::Metadata
:project: deepspeech-dotnet
- :members: items, num_items, confidence
+ :members: Transcripts
-MetadataItem
-------------
+CandidateTranscript
+-------------------
-.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem
+.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript
:project: deepspeech-dotnet
- :members: character, timestep, start_time
+ :members: Tokens, Confidence
+
+TokenMetadata
+-------------
+
+.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata
+ :project: deepspeech-dotnet
+ :members: Text, Timestep, StartTime
diff --git a/doc/NodeJS-API.rst b/doc/NodeJS-API.rst
index aaba718c..b6170b5b 100644
--- a/doc/NodeJS-API.rst
+++ b/doc/NodeJS-API.rst
@@ -30,8 +30,14 @@ Metadata
.. js:autoclass:: Metadata
:members:
-MetadataItem
-------------
+CandidateTranscript
+-------------------
-.. js:autoclass:: MetadataItem
+.. js:autoclass:: CandidateTranscript
+ :members:
+
+TokenMetadata
+-------------
+
+.. js:autoclass:: TokenMetadata
:members:
diff --git a/doc/Python-API.rst b/doc/Python-API.rst
index b2b3567f..9aec57f0 100644
--- a/doc/Python-API.rst
+++ b/doc/Python-API.rst
@@ -21,8 +21,14 @@ Metadata
.. autoclass:: Metadata
:members:
-MetadataItem
-------------
+CandidateTranscript
+-------------------
-.. autoclass:: MetadataItem
+.. autoclass:: CandidateTranscript
+ :members:
+
+TokenMetadata
+-------------
+
+.. autoclass:: TokenMetadata
:members:
diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index 8bfee073..bf4c0f00 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -42,20 +42,20 @@ typedef struct CandidateTranscript {
TokenMetadata* tokens;
/** Size of the tokens array */
int num_tokens;
- /** Approximated confidence value for this transcription. This is roughly the
+ /** Approximated confidence value for this transcript. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
- * contributed to the creation of this transcription.
+ * contributed to the creation of this transcript.
*/
double confidence;
} CandidateTranscript;
/**
- * @brief An array of CandidateTranscript objects computed by the model
+ * @brief An array of CandidateTranscript objects computed by the model.
*/
typedef struct Metadata {
/** Array of CandidateTranscript objects */
CandidateTranscript* transcripts;
- /** Size of the transcriptions array */
+ /** Size of the transcripts array */
int num_transcripts;
} Metadata;
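Consumers walk the renamed hierarchy as Metadata → CandidateTranscript → TokenMetadata. A minimal sketch, assuming a `model` created with `DS_CreateModel()` and a 16-bit mono `buffer` at the model's sample rate; the `dump_metadata` helper is hypothetical, for illustration only and not part of this patch:

```c
#include <stdio.h>
#include "deepspeech.h"

/* Illustrative only: walk Metadata -> CandidateTranscript -> TokenMetadata. */
static void dump_metadata(ModelState* model, const short* buffer,
                          unsigned int buffer_size)
{
  /* Ask for up to 3 candidates; fewer may be returned. */
  Metadata* result = DS_SpeechToTextWithMetadata(model, buffer, buffer_size, 3);
  if (!result) {
    return;
  }
  for (int i = 0; i < result->num_transcripts; ++i) {
    const CandidateTranscript* t = &result->transcripts[i];
    printf("candidate %d, confidence %f\n", i, t->confidence);
    for (int j = 0; j < t->num_tokens; ++j) {
      const TokenMetadata* tok = &t->tokens[j];
      printf("  '%s' at %fs (timestep %d)\n",
             tok->text, (double)tok->start_time, (int)tok->timestep);
    }
  }
  DS_FreeMetadata(result); /* the caller owns the returned struct */
}
```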
@@ -191,14 +191,14 @@ char* DS_SpeechToText(ModelState* aCtx,
unsigned int aBufferSize);
/**
- * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
- * about the results.
+ * @brief Use the DeepSpeech model to perform Speech-To-Text and output results
+ * including metadata.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
- * @param aNumResults The number of candidate transcripts to return.
+ * @param aNumResults The maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information. The user is
@@ -252,7 +252,7 @@ char* DS_IntermediateDecode(const StreamingState* aSctx);
/**
* @brief Compute the intermediate decoding of an ongoing streaming inference,
- * returns per-letter metadata.
+ * and return results including metadata.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aNumResults The number of candidate transcripts to return.
@@ -267,8 +267,8 @@ Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
unsigned int aNumResults);
/**
- * @brief Signal the end of an audio signal to an ongoing streaming
- * inference, returns the STT result over the whole audio signal.
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ * the result. Signals the end of an ongoing streaming inference.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
@@ -281,8 +281,9 @@ DEEPSPEECH_EXPORT
char* DS_FinishStream(StreamingState* aSctx);
/**
- * @brief Signal the end of an audio signal to an ongoing streaming
- * inference, returns per-letter metadata.
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ * results including metadata. Signals the end of an ongoing streaming
+ * inference.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aNumResults The number of candidate transcripts to return.
@@ -295,7 +296,7 @@ char* DS_FinishStream(StreamingState* aSctx);
* @note This method will free the state pointer (@p aSctx).
*/
DEEPSPEECH_EXPORT
-Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
+Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
unsigned int aNumResults);
/**
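Taken together, the streaming variants above compose as follows. A hedged sketch assuming `model`, `chunk`, and `chunk_size` are available and with error handling elided; the `stream_with_metadata` helper is hypothetical and not part of this patch:

```c
#include "deepspeech.h"

/* Illustrative only: stream audio and fetch metadata results. */
static void stream_with_metadata(ModelState* model, const short* chunk,
                                 unsigned int chunk_size)
{
  StreamingState* stream = NULL;
  if (DS_CreateStream(model, &stream) != 0) {
    return; /* error handling elided */
  }

  /* Feed audio incrementally; in real code this loops over chunks. */
  DS_FeedAudioContent(stream, chunk, chunk_size);

  /* Optional peek at the best hypotheses so far; the stream stays live. */
  Metadata* partial = DS_IntermediateDecodeWithMetadata(stream, 2);
  if (partial) {
    DS_FreeMetadata(partial);
  }

  /* Final decode: signals end-of-audio and frees the stream state. */
  Metadata* final_result = DS_FinishStreamWithMetadata(stream, 2);
  if (final_result) {
    /* traverse transcripts as in the previous sketch */
    DS_FreeMetadata(final_result);
  }
}
```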
diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
index ce184cf4..3340c9b3 100644
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
@@ -199,10 +199,10 @@ namespace DeepSpeechClient
}
/// <summary>
- /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+ /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to finish.</param>
- /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+ /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata result.</returns>
public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
{
@@ -220,10 +220,10 @@ namespace DeepSpeechClient
}
/// <summary>
- /// Computes the intermediate decoding of an ongoing streaming inference.
+ /// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to decode.</param>
- /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+ /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The STT intermediate result.</returns>
public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
{
@@ -273,11 +273,11 @@ namespace DeepSpeechClient
}
/// <summary>
- /// Use the DeepSpeech model to perform Speech-To-Text.
+ /// Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
- /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+ /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata. Returns NULL on error.</returns>
public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
{
diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
index ae3e72cf..37d6ce59 100644
--- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
@@ -68,11 +68,11 @@ namespace DeepSpeechClient.Interfaces
uint aBufferSize);
/// <summary>
- /// Use the DeepSpeech model to perform Speech-To-Text.
+ /// Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
- /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+ /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata. Returns NULL on error.</returns>
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
uint aBufferSize,
@@ -105,10 +105,10 @@ namespace DeepSpeechClient.Interfaces
unsafe string IntermediateDecode(DeepSpeechStream stream);
/// <summary>
- /// Computes the intermediate decoding of an ongoing streaming inference.
+ /// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to decode.</param>
- /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+ /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata result.</returns>
unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
@@ -120,10 +120,10 @@ namespace DeepSpeechClient.Interfaces
unsafe string FinishStream(DeepSpeechStream stream);
/// <summary>
- /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+ /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to finish.</param>
- /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+ /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata result.</returns>
unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
}
diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java
index b506b1d3..a5b339b3 100644
--- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java
+++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java
@@ -117,9 +117,10 @@ public class DeepSpeechModel {
* @param buffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param buffer_size The number of samples in the audio signal.
- * @param num_results Number of candidate transcripts to return.
+ * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
- * @return Outputs a Metadata object of individual letters along with their timing information.
+ * @return Metadata struct containing multiple candidate transcripts. Each transcript
+ * has per-token metadata including timing information.
*/
public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) {
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results);
@@ -165,7 +166,7 @@ public class DeepSpeechModel {
* @brief Compute the intermediate decoding of an ongoing streaming inference.
*
* @param ctx A streaming state pointer returned by createStream().
- * @param num_results Number of candidate transcripts to return.
+ * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
* @return The STT intermediate result.
*/
@@ -174,8 +175,8 @@ public class DeepSpeechModel {
}
/**
- * @brief Signal the end of an audio signal to an ongoing streaming
- * inference, returns the STT result over the whole audio signal.
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ * the result. Signals the end of an ongoing streaming inference.
*
* @param ctx A streaming state pointer returned by createStream().
*
@@ -188,13 +189,15 @@ public class DeepSpeechModel {
}
/**
- * @brief Signal the end of an audio signal to an ongoing streaming
- * inference, returns per-letter metadata.
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ * the results including metadata. Signals the end of an ongoing streaming
+ * inference.
*
* @param ctx A streaming state pointer returned by createStream().
- * @param num_results Number of candidate transcripts to return.
+ * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
- * @return Outputs a Metadata object of individual letters along with their timing information.
+ * @return Metadata struct containing multiple candidate transcripts. Each transcript
+ * has per-token metadata including timing information.
*
* @note This method will free the state pointer (@p ctx).
*/
diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java
new file mode 100644
index 00000000..c02b39ad
--- /dev/null
+++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 4.0.1
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+package org.mozilla.deepspeech.libdeepspeech;
+
+/**
+ * A single transcript computed by the model, including a confidence value and
+ * the metadata for its constituent tokens.
+ */
+public class CandidateTranscript {
+ private transient long swigCPtr;
+ protected transient boolean swigCMemOwn;
+
+ protected CandidateTranscript(long cPtr, boolean cMemoryOwn) {
+ swigCMemOwn = cMemoryOwn;
+ swigCPtr = cPtr;
+ }
+
+ protected static long getCPtr(CandidateTranscript obj) {
+ return (obj == null) ? 0 : obj.swigCPtr;
+ }
+
+ public synchronized void delete() {
+ if (swigCPtr != 0) {
+ if (swigCMemOwn) {
+ swigCMemOwn = false;
+ throw new UnsupportedOperationException("C++ destructor does not have public access");
+ }
+ swigCPtr = 0;
+ }
+ }
+
+ /**
+ * Array of TokenMetadata objects
+ */
+ public void setTokens(TokenMetadata value) {
+ implJNI.CandidateTranscript_tokens_set(swigCPtr, this, TokenMetadata.getCPtr(value), value);
+ }
+
+ /**
+ * Array of TokenMetadata objects
+ */
+ public TokenMetadata getTokens() {
+ long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this);
+ return (cPtr == 0) ? null : new TokenMetadata(cPtr, false);
+ }
+
+ /**
+ * Size of the tokens array
+ */
+ public void setNum_tokens(int value) {
+ implJNI.CandidateTranscript_num_tokens_set(swigCPtr, this, value);
+ }
+
+ /**
+ * Size of the tokens array
+ */
+ public int getNum_tokens() {
+ return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this);
+ }
+
+ /**
+ * Approximated confidence value for this transcript. This is roughly the
+ * sum of the acoustic model logit values for each timestep/character that
+ * contributed to the creation of this transcript.
+ */
+ public void setConfidence(double value) {
+ implJNI.CandidateTranscript_confidence_set(swigCPtr, this, value);
+ }
+
+ /**
+ * Approximated confidence value for this transcript. This is roughly the
+ * sum of the acoustic model logit values for each timestep/character that
+ * contributed to the creation of this transcript.
+ */
+ public double getConfidence() {
+ return implJNI.CandidateTranscript_confidence_get(swigCPtr, this);
+ }
+
+ /**
+ * Retrieve one TokenMetadata element
+ *
+ * @param i Array index of the TokenMetadata to get
+ *
+ * @return The TokenMetadata requested or null
+ */
+ public TokenMetadata getToken(int i) {
+ return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), true);
+ }
+
+}
diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java
index 482b7c58..bb9b0773 100644
--- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java
+++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java
@@ -1,6 +1,6 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
- * Version 4.0.2
+ * Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
@@ -9,7 +9,7 @@
package org.mozilla.deepspeech.libdeepspeech;
/**
- * Stores the entire CTC output as an array of character metadata objects
+ * An array of CandidateTranscript objects computed by the model.
*/
public class Metadata {
private transient long swigCPtr;
@@ -40,61 +40,43 @@ public class Metadata {
}
/**
- * List of items
+ * Array of CandidateTranscript objects
*/
- public void setItems(MetadataItem value) {
- implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value);
+ public void setTranscripts(CandidateTranscript value) {
+ implJNI.Metadata_transcripts_set(swigCPtr, this, CandidateTranscript.getCPtr(value), value);
}
/**
- * List of items
+ * Array of CandidateTranscript objects
*/
- public MetadataItem getItems() {
- long cPtr = implJNI.Metadata_items_get(swigCPtr, this);
- return (cPtr == 0) ? null : new MetadataItem(cPtr, false);
+ public CandidateTranscript getTranscripts() {
+ long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this);
+ return (cPtr == 0) ? null : new CandidateTranscript(cPtr, false);
}
/**
- * Size of the list of items
+ * Size of the transcripts array
*/
- public void setNum_items(int value) {
- implJNI.Metadata_num_items_set(swigCPtr, this, value);
+ public void setNum_transcripts(int value) {
+ implJNI.Metadata_num_transcripts_set(swigCPtr, this, value);
}
/**
- * Size of the list of items
+ * Size of the transcripts array
*/
- public int getNum_items() {
- return implJNI.Metadata_num_items_get(swigCPtr, this);
+ public int getNum_transcripts() {
+ return implJNI.Metadata_num_transcripts_get(swigCPtr, this);
}
/**
- * Approximated confidence value for this transcription. This is roughly the
- * sum of the acoustic model logit values for each timestep/character that
- * contributed to the creation of this transcription.
+ * Retrieve one CandidateTranscript element
+ *
+ * @param i Array index of the CandidateTranscript to get
+ *
+ * @return The CandidateTranscript requested or null
*/
- public void setConfidence(double value) {
- implJNI.Metadata_confidence_set(swigCPtr, this, value);
- }
-
- /**
- * Approximated confidence value for this transcription. This is roughly the
- * sum of the acoustic model logit values for each timestep/character that
- * contributed to the creation of this transcription.
- */
- public double getConfidence() {
- return implJNI.Metadata_confidence_get(swigCPtr, this);
- }
-
- /**
- * Retrieve one MetadataItem element
- *
- * @param i Array index of the MetadataItem to get
- *
- * @return The MetadataItem requested or null
- */
- public MetadataItem getItem(int i) {
- return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true);
+ public CandidateTranscript getTranscript(int i) {
+ return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), true);
}
}
diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java
new file mode 100644
index 00000000..32246f1a
--- /dev/null
+++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java
@@ -0,0 +1,79 @@
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 4.0.1
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+package org.mozilla.deepspeech.libdeepspeech;
+
+/**
+ * Stores text of an individual token, along with its timing information
+ */
+public class TokenMetadata {
+ private transient long swigCPtr;
+ protected transient boolean swigCMemOwn;
+
+ protected TokenMetadata(long cPtr, boolean cMemoryOwn) {
+ swigCMemOwn = cMemoryOwn;
+ swigCPtr = cPtr;
+ }
+
+ protected static long getCPtr(TokenMetadata obj) {
+ return (obj == null) ? 0 : obj.swigCPtr;
+ }
+
+ public synchronized void delete() {
+ if (swigCPtr != 0) {
+ if (swigCMemOwn) {
+ swigCMemOwn = false;
+ throw new UnsupportedOperationException("C++ destructor does not have public access");
+ }
+ swigCPtr = 0;
+ }
+ }
+
+ /**
+ * The text corresponding to this token
+ */
+ public void setText(String value) {
+ implJNI.TokenMetadata_text_set(swigCPtr, this, value);
+ }
+
+ /**
+ * The text corresponding to this token
+ */
+ public String getText() {
+ return implJNI.TokenMetadata_text_get(swigCPtr, this);
+ }
+
+ /**
+ * Position of the token in units of 20ms
+ */
+ public void setTimestep(int value) {
+ implJNI.TokenMetadata_timestep_set(swigCPtr, this, value);
+ }
+
+ /**
+ * Position of the token in units of 20ms
+ */
+ public int getTimestep() {
+ return implJNI.TokenMetadata_timestep_get(swigCPtr, this);
+ }
+
+ /**
+ * Position of the token in seconds
+ */
+ public void setStart_time(float value) {
+ implJNI.TokenMetadata_start_time_set(swigCPtr, this, value);
+ }
+
+ /**
+ * Position of the token in seconds
+ */
+ public float getStart_time() {
+ return implJNI.TokenMetadata_start_time_get(swigCPtr, this);
+ }
+
+}
diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js
index 7a027bde..6ce06c0d 100644
--- a/native_client/javascript/index.js
+++ b/native_client/javascript/index.js
@@ -115,12 +115,12 @@ Model.prototype.stt = function(aBuffer) {
}
/**
- * Use the DeepSpeech model to perform Speech-To-Text and output metadata
- * about the results.
+ * Use the DeepSpeech model to perform Speech-To-Text and output results including metadata.
*
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
*
- * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
+ * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
aNumResults = aNumResults || 1;
@@ -173,9 +173,11 @@ Stream.prototype.intermediateDecode = function() {
}
/**
- * Compute the intermediate decoding of an ongoing streaming inference.
+ * Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
*
- * @return {string} The STT intermediate result.
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
+ *
+ * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
aNumResults = aNumResults || 1;
@@ -183,7 +185,7 @@ Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
}
/**
- * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
+ * Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
*
* @return {string} The STT result.
*
@@ -196,7 +198,9 @@ Stream.prototype.finishStream = function() {
}
/**
- * Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
+ * Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
+ *
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
*
@@ -253,48 +257,49 @@ function Version() {
/**
* @class
*
- * Stores each individual character, along with its timing information
+ * Stores text of an individual token, along with its timing information
*/
function TokenMetadata() {}
/**
- * The character generated for transcription
+ * The text corresponding to this token
*
- * @return {string} The character generated
+ * @return {string} The text generated
*/
TokenMetadata.prototype.text = function() {}
/**
- * Position of the character in units of 20ms
+ * Position of the token in units of 20ms
*
- * @return {int} The position of the character
+ * @return {int} The position of the token
*/
TokenMetadata.prototype.timestep = function() {};
/**
- * Position of the character in seconds
+ * Position of the token in seconds
*
- * @return {float} The position of the character
+ * @return {float} The position of the token
*/
TokenMetadata.prototype.start_time = function() {};
/**
* @class
*
- * Stores the entire CTC output as an array of character metadata objects
+ * A single transcript computed by the model, including a confidence value and
+ * the metadata for its constituent tokens.
*/
function CandidateTranscript () {}
/**
- * List of items
+ * Array of tokens
*
- * @return {array} List of :js:func:`TokenMetadata`
+ * @return {array} Array of :js:func:`TokenMetadata`
*/
-CandidateTranscript.prototype.items = function() {}
+CandidateTranscript.prototype.tokens = function() {}
/**
* Approximated confidence value for this transcription. This is roughly the
- * sum of the acoustic model logit values for each timestep/character that
+ * sum of the acoustic model logit values for each timestep/token that
* contributed to the creation of this transcription.
*
* @return {float} Confidence value
@@ -304,14 +309,14 @@ CandidateTranscript.prototype.confidence = function() {}
/**
* @class
*
- * Stores the entire CTC output as an array of character metadata objects
+ * An array of CandidateTranscript objects computed by the model.
*/
function Metadata () {}
/**
- * List of items
+ * Array of transcripts
*
- * @return {array} List of :js:func:`CandidateTranscript` objects
+ * @return {array} Array of :js:func:`CandidateTranscript` objects
*/
Metadata.prototype.transcripts = function() {}
diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py
index 5d9072ec..a44cf05f 100644
--- a/native_client/python/__init__.py
+++ b/native_client/python/__init__.py
@@ -123,15 +123,15 @@ class Model(object):
def sttWithMetadata(self, audio_buffer, num_results=1):
"""
- Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
+ Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.
:param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type audio_buffer: numpy.int16 array
- :param num_results: Number of candidate transcripts to return.
+ :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
- :return: Outputs a struct of individual letters along with their timing information.
+ :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata`
"""
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results)
@@ -192,10 +192,13 @@ class Stream(object):
def intermediateDecodeWithMetadata(self, num_results=1):
"""
- Compute the intermediate decoding of an ongoing streaming inference.
+ Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
- :return: The STT intermediate result.
- :type: str
+ :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
+ :type num_results: int
+
+ :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
+ :type: :func:`Metadata`
:throws: RuntimeError if the stream object is not valid
"""
@@ -205,8 +208,9 @@ class Stream(object):
def finishStream(self):
"""
- Signal the end of an audio signal to an ongoing streaming inference,
- returns the STT result over the whole audio signal.
+ Compute the final decoding of an ongoing streaming inference and return
+ the result. Signals the end of an ongoing streaming inference. The underlying
+ stream object must not be used after this method is called.
:return: The STT result.
:type: str
@@ -221,13 +225,15 @@ class Stream(object):
def finishStreamWithMetadata(self, num_results=1):
"""
- Signal the end of an audio signal to an ongoing streaming inference,
- returns per-letter metadata.
+ Compute the final decoding of an ongoing streaming inference and return
+ results including metadata. Signals the end of an ongoing streaming
+ inference. The underlying stream object must not be used after this
+ method is called.
- :param num_results: Number of candidate transcripts to return.
+ :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
- :return: Outputs a struct of individual letters along with their timing information.
+ :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata`
:throws: RuntimeError if the stream object is not valid