From e9ae38bf4789b9a2f62520c622c1eba1af656a9c Mon Sep 17 00:00:00 2001
From: Reuben Morais <reuben.morais@gmail.com>
Date: Tue, 25 Feb 2020 15:43:36 +0100
Subject: [PATCH] Update docs

---
 doc/C-API.rst                                 |  3 +
 doc/DotNet-API.rst                            | 19 ++--
 doc/NodeJS-API.rst                            | 12 ++-
 doc/Python-API.rst                            | 12 ++-
 native_client/deepspeech.h                    | 27 +++---
 .../dotnet/DeepSpeechClient/DeepSpeech.cs     | 12 +--
 .../Interfaces/IDeepSpeech.cs                 | 12 +--
 .../libdeepspeech/DeepSpeechModel.java        | 21 ++--
 .../CandidateTranscript.java                  | 96 +++++++++++++++++++
 .../libdeepspeech_doc/Metadata.java           | 62 +++++-------
 .../libdeepspeech_doc/TokenMetadata.java      | 79 +++++++++++++++
 native_client/javascript/index.js             | 49 +++++-----
 native_client/python/__init__.py              | 30 +++---
 13 files changed, 314 insertions(+), 120 deletions(-)
 create mode 100644 native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java
 create mode 100644 native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java

diff --git a/doc/C-API.rst b/doc/C-API.rst
index 2506d9b2..2b0e7e05 100644
--- a/doc/C-API.rst
+++ b/doc/C-API.rst
@@ -34,6 +34,9 @@ C
 .. doxygenfunction:: DS_IntermediateDecode
    :project: deepspeech-c
 
+.. doxygenfunction:: DS_IntermediateDecodeWithMetadata
+   :project: deepspeech-c
+
 .. doxygenfunction:: DS_FinishStream
    :project: deepspeech-c
 
diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst
index 2ba3415f..d43c7afb 100644
--- a/doc/DotNet-API.rst
+++ b/doc/DotNet-API.rst
@@ -31,13 +31,20 @@ ErrorCodes
 Metadata
 --------
 
-.. doxygenstruct:: DeepSpeechClient::Structs::Metadata
+.. doxygenstruct:: DeepSpeechClient::Models::Metadata
    :project: deepspeech-dotnet
-   :members: items, num_items, confidence
+   :members: Transcripts
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem
+.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript
    :project: deepspeech-dotnet
-   :members: character, timestep, start_time
+   :members: Tokens, Confidence
+
+TokenMetadata
+-------------
+
+.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata
+   :project: deepspeech-dotnet
+   :members: Text, Timestep, StartTime
diff --git a/doc/NodeJS-API.rst b/doc/NodeJS-API.rst
index aaba718c..b6170b5b 100644
--- a/doc/NodeJS-API.rst
+++ b/doc/NodeJS-API.rst
@@ -30,8 +30,14 @@ Metadata
 .. js:autoclass:: Metadata
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. js:autoclass:: MetadataItem
+.. js:autoclass:: CandidateTranscript
+   :members:
+
+TokenMetadata
+-------------
+
+.. js:autoclass:: TokenMetadata
    :members:
diff --git a/doc/Python-API.rst b/doc/Python-API.rst
index b2b3567f..9aec57f0 100644
--- a/doc/Python-API.rst
+++ b/doc/Python-API.rst
@@ -21,8 +21,14 @@ Metadata
 .. autoclass:: Metadata
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. autoclass:: MetadataItem
+.. autoclass:: CandidateTranscript
+   :members:
+
+TokenMetadata
+-------------
+
+.. autoclass:: TokenMetadata
    :members:
diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index 8bfee073..bf4c0f00 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -42,20 +42,20 @@ typedef struct CandidateTranscript {
   TokenMetadata* tokens;
   /** Size of the tokens array */
   int num_tokens;
-  /** Approximated confidence value for this transcription. This is roughly the
+  /** Approximated confidence value for this transcript. This is roughly the
    * sum of the acoustic model logit values for each timestep/character that
-   * contributed to the creation of this transcription.
+   * contributed to the creation of this transcript.
    */
   double confidence;
 } CandidateTranscript;
 
 /**
- * @brief An array of CandidateTranscript objects computed by the model
+ * @brief An array of CandidateTranscript objects computed by the model.
  */
 typedef struct Metadata {
   /** Array of CandidateTranscript objects */
   CandidateTranscript* transcripts;
-  /** Size of the transcriptions array */
+  /** Size of the transcripts array */
   int num_transcripts;
 } Metadata;
 
@@ -191,14 +191,14 @@ char* DS_SpeechToText(ModelState* aCtx,
                       unsigned int aBufferSize);
 
 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata 
- * about the results.
+ * @brief Use the DeepSpeech model to perform Speech-To-Text and output results
+ * including metadata.
  *
  * @param aCtx The ModelState pointer for the model to use.
  * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
  *                sample rate (matching what the model was trained on).
  * @param aBufferSize The number of samples in the audio signal.
- * @param aNumResults The number of candidate transcripts to return.
+ * @param aNumResults The maximum number of candidate transcripts to return. The number of transcripts returned might be smaller than this.
  *
  * @return Metadata struct containing multiple candidate transcripts. Each transcript
  *         has per-token metadata including timing information. The user is
@@ -252,7 +252,7 @@ char* DS_IntermediateDecode(const StreamingState* aSctx);
 
 /**
  * @brief Compute the intermediate decoding of an ongoing streaming inference,
- *        returns per-letter metadata.
+ *        and return results including metadata.
  *
  * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
  * @param aNumResults The number of candidate transcripts to return.
@@ -267,8 +267,8 @@ Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
                                             unsigned int aNumResults);
 
 /**
- * @brief Signal the end of an audio signal to an ongoing streaming
- *        inference, returns the STT result over the whole audio signal.
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ *        the result. Signals the end of an ongoing streaming inference.
  *
  * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
  *
@@ -281,8 +281,9 @@ DEEPSPEECH_EXPORT
 char* DS_FinishStream(StreamingState* aSctx);
 
 /**
- * @brief Signal the end of an audio signal to an ongoing streaming
- *        inference, returns per-letter metadata.
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ *        results including metadata. Signals the end of an ongoing streaming
+ *        inference.
  *
  * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
  * @param aNumResults The number of candidate transcripts to return.
@@ -295,7 +296,7 @@ char* DS_FinishStream(StreamingState* aSctx);
  * @note This method will free the state pointer (@p aSctx).
  */
 DEEPSPEECH_EXPORT
-Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx, 
+Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
                                       unsigned int aNumResults);
 
 /**
diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
index ce184cf4..3340c9b3 100644
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
@@ -199,10 +199,10 @@ namespace DeepSpeechClient
         }
 
         /// <summary>
-        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
         /// </summary>
         /// <param name="stream">Instance of the stream to finish.</param>
-        /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
         /// <returns>The extended metadata result.</returns>
         public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
         {
@@ -220,10 +220,10 @@ namespace DeepSpeechClient
         }
 
         /// <summary>
-        /// Computes the intermediate decoding of an ongoing streaming inference.
+        /// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
         /// </summary>
         /// <param name="stream">Instance of the stream to decode.</param>
-        /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
         /// <returns>The STT intermediate result.</returns>
         public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
         {
@@ -273,11 +273,11 @@ namespace DeepSpeechClient
         }
 
         /// <summary>
-        /// Use the DeepSpeech model to perform Speech-To-Text.
+        /// Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.
         /// </summary>
         /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
         /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-        /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
         /// <returns>The extended metadata. Returns NULL on error.</returns>
         public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
         {
diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
index ae3e72cf..37d6ce59 100644
--- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
@@ -68,11 +68,11 @@ namespace DeepSpeechClient.Interfaces
                 uint aBufferSize);
 
         /// <summary>
-        /// Use the DeepSpeech model to perform Speech-To-Text.
+        /// Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.
         /// </summary>
         /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
         /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-        /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
         /// <returns>The extended metadata. Returns NULL on error.</returns>
         unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
                 uint aBufferSize,
@@ -105,10 +105,10 @@ namespace DeepSpeechClient.Interfaces
         unsafe string IntermediateDecode(DeepSpeechStream stream);
 
         /// <summary>
-        /// Computes the intermediate decoding of an ongoing streaming inference.
+        /// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
         /// </summary>
         /// <param name="stream">Instance of the stream to decode.</param>
-        /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
         /// <returns>The extended metadata result.</returns>
         unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
 
@@ -120,10 +120,10 @@ namespace DeepSpeechClient.Interfaces
         unsafe string FinishStream(DeepSpeechStream stream);
 
         /// <summary>
-        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
         /// </summary>
         /// <param name="stream">Instance of the stream to finish.</param>
-        /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
         /// <returns>The extended metadata result.</returns>
         unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
     }
diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java
index b506b1d3..a5b339b3 100644
--- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java
+++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java
@@ -117,9 +117,10 @@ public class DeepSpeechModel {
     * @param buffer A 16-bit, mono raw audio signal at the appropriate
     *                sample rate (matching what the model was trained on).
     * @param buffer_size The number of samples in the audio signal.
-    * @param num_results Number of candidate transcripts to return.
+    * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
     *
-    * @return Outputs a Metadata object of individual letters along with their timing information.
+    * @return Metadata object containing multiple candidate transcripts. Each transcript
+    *         has per-token metadata including timing information.
     */
     public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) {
         return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results);
@@ -165,7 +166,7 @@ public class DeepSpeechModel {
     * @brief Compute the intermediate decoding of an ongoing streaming inference.
     *
     * @param ctx A streaming state pointer returned by createStream().
-    * @param num_results Number of candidate transcripts to return.
+    * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
     *
     * @return The STT intermediate result.
     */
@@ -174,8 +175,8 @@ public class DeepSpeechModel {
     }
 
    /**
-    * @brief Signal the end of an audio signal to an ongoing streaming
-    *        inference, returns the STT result over the whole audio signal.
+    * @brief Compute the final decoding of an ongoing streaming inference and return
+    *        the result. Signals the end of an ongoing streaming inference.
     *
     * @param ctx A streaming state pointer returned by createStream().
     *
@@ -188,13 +189,15 @@ public class DeepSpeechModel {
     }
 
    /**
-    * @brief Signal the end of an audio signal to an ongoing streaming
-    *        inference, returns per-letter metadata.
+    * @brief Compute the final decoding of an ongoing streaming inference and return
+    *        the results including metadata. Signals the end of an ongoing streaming
+    *        inference.
     *
     * @param ctx A streaming state pointer returned by createStream().
-    * @param num_results Number of candidate transcripts to return.
+    * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
     *
-    * @return Outputs a Metadata object of individual letters along with their timing information.
+    * @return Metadata object containing multiple candidate transcripts. Each transcript
+    *         has per-token metadata including timing information.
     *
     * @note This method will free the state pointer (@p ctx).
     */
diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java
new file mode 100644
index 00000000..c02b39ad
--- /dev/null
+++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 4.0.1
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+package org.mozilla.deepspeech.libdeepspeech;
+
+/**
+ * A single transcript computed by the model, including a confidence value and
+ * the metadata for its constituent tokens.
+ */
+public class CandidateTranscript {
+  private transient long swigCPtr;
+  protected transient boolean swigCMemOwn;
+
+  protected CandidateTranscript(long cPtr, boolean cMemoryOwn) {
+    swigCMemOwn = cMemoryOwn;
+    swigCPtr = cPtr;
+  }
+
+  protected static long getCPtr(CandidateTranscript obj) {
+    return (obj == null) ? 0 : obj.swigCPtr;
+  }
+
+  public synchronized void delete() {
+    if (swigCPtr != 0) {
+      if (swigCMemOwn) {
+        swigCMemOwn = false;
+        throw new UnsupportedOperationException("C++ destructor does not have public access");
+      }
+      swigCPtr = 0;
+    }
+  }
+
+  /**
+   * Array of TokenMetadata objects
+   */
+  public void setTokens(TokenMetadata value) {
+    implJNI.CandidateTranscript_tokens_set(swigCPtr, this, TokenMetadata.getCPtr(value), value);
+  }
+
+  /**
+   * Array of TokenMetadata objects
+   */
+  public TokenMetadata getTokens() {
+    long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this);
+    return (cPtr == 0) ? null : new TokenMetadata(cPtr, false);
+  }
+
+  /**
+   * Size of the tokens array
+   */
+  public void setNum_tokens(int value) {
+    implJNI.CandidateTranscript_num_tokens_set(swigCPtr, this, value);
+  }
+
+  /**
+   * Size of the tokens array
+   */
+  public int getNum_tokens() {
+    return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this);
+  }
+
+  /**
+   * Approximated confidence value for this transcript. This is roughly the
+   * sum of the acoustic model logit values for each timestep/character that
+   * contributed to the creation of this transcript.
+   */
+  public void setConfidence(double value) {
+    implJNI.CandidateTranscript_confidence_set(swigCPtr, this, value);
+  }
+
+  /**
+   * Approximated confidence value for this transcript. This is roughly the
+   * sum of the acoustic model logit values for each timestep/character that
+   * contributed to the creation of this transcript.
+   */
+  public double getConfidence() {
+    return implJNI.CandidateTranscript_confidence_get(swigCPtr, this);
+  }
+
+  /**
+   * Retrieve one TokenMetadata element
+   *
+   * @param i Array index of the TokenMetadata to get
+   *
+   * @return The TokenMetadata requested or null
+   */
+  public TokenMetadata getToken(int i) {
+    return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), true);
+  }
+
+}
diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java
index 482b7c58..bb9b0773 100644
--- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java
+++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java
@@ -1,6 +1,6 @@
 /* ----------------------------------------------------------------------------
  * This file was automatically generated by SWIG (http://www.swig.org).
- * Version 4.0.2
+ * Version 4.0.1
  *
  * Do not make changes to this file unless you know what you are doing--modify
  * the SWIG interface file instead.
@@ -9,7 +9,7 @@
 package org.mozilla.deepspeech.libdeepspeech;
 
 /**
- * Stores the entire CTC output as an array of character metadata objects
+ * An array of CandidateTranscript objects computed by the model.
  */
 public class Metadata {
   private transient long swigCPtr;
@@ -40,61 +40,43 @@ public class Metadata {
   }
 
   /**
-   *  List of items 
+   * Array of CandidateTranscript objects
    */
-  public void setItems(MetadataItem value) {
-    implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value);
+  public void setTranscripts(CandidateTranscript value) {
+    implJNI.Metadata_transcripts_set(swigCPtr, this, CandidateTranscript.getCPtr(value), value);
   }
 
   /**
-   *  List of items 
+   * Array of CandidateTranscript objects
    */
-  public MetadataItem getItems() {
-    long cPtr = implJNI.Metadata_items_get(swigCPtr, this);
-    return (cPtr == 0) ? null : new MetadataItem(cPtr, false);
+  public CandidateTranscript getTranscripts() {
+    long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this);
+    return (cPtr == 0) ? null : new CandidateTranscript(cPtr, false);
   }
 
   /**
-   *  Size of the list of items 
+   * Size of the transcripts array
    */
-  public void setNum_items(int value) {
-    implJNI.Metadata_num_items_set(swigCPtr, this, value);
+  public void setNum_transcripts(int value) {
+    implJNI.Metadata_num_transcripts_set(swigCPtr, this, value);
   }
 
   /**
-   *  Size of the list of items 
+   * Size of the transcripts array
    */
-  public int getNum_items() {
-    return implJNI.Metadata_num_items_get(swigCPtr, this);
+  public int getNum_transcripts() {
+    return implJNI.Metadata_num_transcripts_get(swigCPtr, this);
   }
 
   /**
-   *  Approximated confidence value for this transcription. This is roughly the<br>
-   * sum of the acoustic model logit values for each timestep/character that<br>
-   * contributed to the creation of this transcription.
+   * Retrieve one CandidateTranscript element
+   *
+   * @param i Array index of the CandidateTranscript to get
+   *
+   * @return The CandidateTranscript requested or null
    */
-  public void setConfidence(double value) {
-    implJNI.Metadata_confidence_set(swigCPtr, this, value);
-  }
-
-  /**
-   *  Approximated confidence value for this transcription. This is roughly the<br>
-   * sum of the acoustic model logit values for each timestep/character that<br>
-   * contributed to the creation of this transcription.
-   */
-  public double getConfidence() {
-    return implJNI.Metadata_confidence_get(swigCPtr, this);
-  }
-
-  /**
-   * Retrieve one MetadataItem element<br>
-   * <br>
-   * @param i Array index of the MetadataItem to get<br>
-   * <br>
-   * @return The MetadataItem requested or null
-   */
-  public MetadataItem getItem(int i) {
-    return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true);
+  public CandidateTranscript getTranscript(int i) {
+    return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), true);
   }
 
 }
diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java
new file mode 100644
index 00000000..32246f1a
--- /dev/null
+++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java
@@ -0,0 +1,79 @@
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 4.0.1
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+package org.mozilla.deepspeech.libdeepspeech;
+
+/**
+ * Stores text of an individual token, along with its timing information
+ */
+public class TokenMetadata {
+  private transient long swigCPtr;
+  protected transient boolean swigCMemOwn;
+
+  protected TokenMetadata(long cPtr, boolean cMemoryOwn) {
+    swigCMemOwn = cMemoryOwn;
+    swigCPtr = cPtr;
+  }
+
+  protected static long getCPtr(TokenMetadata obj) {
+    return (obj == null) ? 0 : obj.swigCPtr;
+  }
+
+  public synchronized void delete() {
+    if (swigCPtr != 0) {
+      if (swigCMemOwn) {
+        swigCMemOwn = false;
+        throw new UnsupportedOperationException("C++ destructor does not have public access");
+      }
+      swigCPtr = 0;
+    }
+  }
+
+  /**
+   * The text corresponding to this token
+   */
+  public void setText(String value) {
+    implJNI.TokenMetadata_text_set(swigCPtr, this, value);
+  }
+
+  /**
+   * The text corresponding to this token
+   */
+  public String getText() {
+    return implJNI.TokenMetadata_text_get(swigCPtr, this);
+  }
+
+  /**
+   * Position of the token in units of 20ms
+   */
+  public void setTimestep(int value) {
+    implJNI.TokenMetadata_timestep_set(swigCPtr, this, value);
+  }
+
+  /**
+   * Position of the token in units of 20ms
+   */
+  public int getTimestep() {
+    return implJNI.TokenMetadata_timestep_get(swigCPtr, this);
+  }
+
+  /**
+   * Position of the token in seconds
+   */
+  public void setStart_time(float value) {
+    implJNI.TokenMetadata_start_time_set(swigCPtr, this, value);
+  }
+
+  /**
+   * Position of the token in seconds
+   */
+  public float getStart_time() {
+    return implJNI.TokenMetadata_start_time_get(swigCPtr, this);
+  }
+
+}
diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js
index 7a027bde..6ce06c0d 100644
--- a/native_client/javascript/index.js
+++ b/native_client/javascript/index.js
@@ -115,12 +115,12 @@ Model.prototype.stt = function(aBuffer) {
 }
 
 /**
- * Use the DeepSpeech model to perform Speech-To-Text and output metadata
- * about the results.
+ * Use the DeepSpeech model to perform Speech-To-Text and output results including metadata.
  *
  * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
  *
- * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
+ * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
  */
 Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
     aNumResults = aNumResults || 1;
@@ -173,9 +173,11 @@ Stream.prototype.intermediateDecode = function() {
 }
 
 /**
- * Compute the intermediate decoding of an ongoing streaming inference.
+ * Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
  *
- * @return {string} The STT intermediate result.
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
+ *
+ * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
  */
 Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
     aNumResults = aNumResults || 1;
@@ -183,7 +185,7 @@ Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
 }
 
 /**
- * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
+ * Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
  *
  * @return {string} The STT result.
  *
@@ -196,7 +198,9 @@ Stream.prototype.finishStream = function() {
 }
 
 /**
- * Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
+ * Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
+ *
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
  *
  * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
  *
@@ -253,48 +257,49 @@ function Version() {
 /**
  * @class
  * 
- * Stores each individual character, along with its timing information
+ * Stores text of an individual token, along with its timing information
  */
 function TokenMetadata() {}
 
 /** 
- * The character generated for transcription
+ * The text corresponding to this token
  *
- * @return {string} The character generated
+ * @return {string} The text generated
  */
 TokenMetadata.prototype.text = function() {}
 
 /**
- * Position of the character in units of 20ms
+ * Position of the token in units of 20ms
  *
- * @return {int} The position of the character
+ * @return {int} The position of the token
  */
 TokenMetadata.prototype.timestep = function() {};
 
 /**
- * Position of the character in seconds
+ * Position of the token in seconds
  *
- * @return {float} The position of the character
+ * @return {float} The position of the token
  */
 TokenMetadata.prototype.start_time = function() {};
 
 /**
  * @class
  *
- * Stores the entire CTC output as an array of character metadata objects
+ * A single transcript computed by the model, including a confidence value and
+ * the metadata for its constituent tokens.
  */
 function CandidateTranscript () {}
 
 /**
- * List of items
+ * Array of tokens
  *
- * @return {array} List of :js:func:`TokenMetadata`
+ * @return {array} Array of :js:func:`TokenMetadata`
  */
-CandidateTranscript.prototype.items = function() {}
+CandidateTranscript.prototype.tokens = function() {}
 
 /**
  * Approximated confidence value for this transcription. This is roughly the
- * sum of the acoustic model logit values for each timestep/character that
+ * sum of the acoustic model logit values for each timestep/token that
  * contributed to the creation of this transcription.
  *
  * @return {float} Confidence value
@@ -304,14 +309,14 @@ CandidateTranscript.prototype.confidence = function() {}
 /**
  * @class
  *
- * Stores the entire CTC output as an array of character metadata objects
+ * An array of CandidateTranscript objects computed by the model.
  */
 function Metadata () {}
 
 /**
- * List of items
+ * Array of transcripts
  *
- * @return {array} List of :js:func:`CandidateTranscript` objects
+ * @return {array} Array of :js:func:`CandidateTranscript` objects
  */
 Metadata.prototype.transcripts = function() {}
 
diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py
index 5d9072ec..a44cf05f 100644
--- a/native_client/python/__init__.py
+++ b/native_client/python/__init__.py
@@ -123,15 +123,15 @@ class Model(object):
 
     def sttWithMetadata(self, audio_buffer, num_results=1):
         """
-        Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
+        Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.
 
         :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         :type audio_buffer: numpy.int16 array
 
-        :param num_results: Number of candidate transcripts to return.
+        :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
         :type num_results: int
 
-        :return: Outputs a struct of individual letters along with their timing information.
+        :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
         :type: :func:`Metadata`
         """
         return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results)
@@ -192,10 +192,13 @@ class Stream(object):
 
     def intermediateDecodeWithMetadata(self, num_results=1):
         """
-        Compute the intermediate decoding of an ongoing streaming inference.
+        Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
 
-        :return: The STT intermediate result.
-        :type: str
+        :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
+        :type num_results: int
+
+        :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
+        :type: :func:`Metadata`
 
         :throws: RuntimeError if the stream object is not valid
         """
@@ -205,8 +208,9 @@ class Stream(object):
 
     def finishStream(self):
         """
-        Signal the end of an audio signal to an ongoing streaming inference,
-        returns the STT result over the whole audio signal.
+        Compute the final decoding of an ongoing streaming inference and return
+        the result. Signals the end of an ongoing streaming inference. The underlying
+        stream object must not be used after this method is called.
 
         :return: The STT result.
         :type: str
@@ -221,13 +225,15 @@ class Stream(object):
 
     def finishStreamWithMetadata(self, num_results=1):
         """
-        Signal the end of an audio signal to an ongoing streaming inference,
-        returns per-letter metadata.
+        Compute the final decoding of an ongoing streaming inference and return
+        results including metadata. Signals the end of an ongoing streaming
+        inference. The underlying stream object must not be used after this
+        method is called.
 
-        :param num_results: Number of candidate transcripts to return.
+        :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
         :type num_results: int
 
-        :return: Outputs a struct of individual letters along with their timing information.
+        :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
         :type: :func:`Metadata`
 
         :throws: RuntimeError if the stream object is not valid