Merge pull request #2792 from reuben/multiple_transcriptions

Expose multiple transcriptions in "WithMetadata" API
Reuben Morais 2020-03-20 16:58:32 +01:00 committed by GitHub
Parents 5e46d702af ee30a1c9de
Commit 903d0b8fe4
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
41 changed files with 1006 additions and 467 deletions

View file

@@ -34,6 +34,9 @@ C
.. doxygenfunction:: DS_IntermediateDecode
:project: deepspeech-c
.. doxygenfunction:: DS_IntermediateDecodeWithMetadata
:project: deepspeech-c
.. doxygenfunction:: DS_FinishStream
:project: deepspeech-c

View file

@@ -31,13 +31,20 @@ ErrorCodes
Metadata
--------
.. doxygenstruct:: DeepSpeechClient::Structs::Metadata
.. doxygenclass:: DeepSpeechClient::Models::Metadata
:project: deepspeech-dotnet
:members: items, num_items, confidence
:members: Transcripts
MetadataItem
------------
CandidateTranscript
-------------------
.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem
.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript
:project: deepspeech-dotnet
:members: character, timestep, start_time
:members: Tokens, Confidence
TokenMetadata
-------------
.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata
:project: deepspeech-dotnet
:members: Text, Timestep, StartTime

View file

@@ -13,11 +13,17 @@ Metadata
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
:project: deepspeech-java
:members: getItems, getNum_items, getProbability, getItem
:members: getTranscripts, getNum_transcripts, getTranscript
MetadataItem
------------
CandidateTranscript
-------------------
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript
:project: deepspeech-java
:members: getCharacter, getTimestep, getStart_time
:members: getTokens, getNum_tokens, getConfidence, getToken
TokenMetadata
-------------
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata
:project: deepspeech-java
:members: getText, getTimestep, getStart_time

View file

@@ -30,8 +30,14 @@ Metadata
.. js:autoclass:: Metadata
:members:
MetadataItem
------------
CandidateTranscript
-------------------
.. js:autoclass:: MetadataItem
.. js:autoclass:: CandidateTranscript
:members:
TokenMetadata
-------------
.. js:autoclass:: TokenMetadata
:members:

View file

@@ -21,8 +21,14 @@ Metadata
.. autoclass:: Metadata
:members:
MetadataItem
------------
CandidateTranscript
-------------------
.. autoclass:: MetadataItem
.. autoclass:: CandidateTranscript
:members:
TokenMetadata
-------------
.. autoclass:: TokenMetadata
:members:

View file

@@ -8,9 +8,16 @@ Metadata
:project: deepspeech-c
:members:
MetadataItem
------------
CandidateTranscript
-------------------
.. doxygenstruct:: MetadataItem
.. doxygenstruct:: CandidateTranscript
:project: deepspeech-c
:members:
TokenMetadata
-------------
.. doxygenstruct:: TokenMetadata
:project: deepspeech-c
:members:

View file

@@ -790,7 +790,7 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/
INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses

View file

@@ -34,6 +34,8 @@ bool extended_metadata = false;
bool json_output = false;
int json_candidate_transcripts = 3;
int stream_size = 0;
void PrintHelp(const char* bin)
@@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
"\n"
"Running DeepSpeech inference.\n"
"\n"
"\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
"\t--scorer SCORER\t\tPath to the external scorer file\n"
"\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
"\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
"\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
"\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
"\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
"\t--extended\t\tOutput string from extended metadata\n"
"\t--json\t\t\tExtended output, shows word timings as JSON\n"
"\t--stream size\t\tRun in stream mode, output intermediate results\n"
"\t--help\t\t\tShow help\n"
"\t--version\t\tPrint version and exits\n";
"\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
"\t--scorer SCORER\t\t\tPath to the external scorer file\n"
"\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
"\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
"\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
"\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
"\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
"\t--extended\t\t\tOutput string from extended metadata\n"
"\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
"\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
"\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
"\t--help\t\t\t\tShow help\n"
"\t--version\t\t\tPrint version and exits\n";
char* version = DS_Version();
std::cerr << "DeepSpeech " << version << "\n";
DS_FreeString(version);
@@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
{"t", no_argument, nullptr, 't'},
{"extended", no_argument, nullptr, 'e'},
{"json", no_argument, nullptr, 'j'},
{"candidate_transcripts", required_argument, nullptr, 150},
{"stream", required_argument, nullptr, 's'},
{"version", no_argument, nullptr, 'v'},
{"help", no_argument, nullptr, 'h'},
@@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
json_output = true;
break;
case 150:
json_candidate_transcripts = atoi(optarg);
break;
case 's':
stream_size = atoi(optarg);
break;
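As a usage sketch of the new flag alongside --json (the binary and file names here are hypothetical):

./deepspeech --model deepspeech.pbmm --scorer kenlm.scorer --audio audio.wav --json --candidate_transcripts 3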

View file

@@ -44,9 +44,115 @@ struct meta_word {
float duration;
};
char* metadataToString(Metadata* metadata);
std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
char* JSONOutput(Metadata* metadata);
char*
CandidateTranscriptToString(const CandidateTranscript* transcript)
{
std::string retval = "";
for (int i = 0; i < transcript->num_tokens; i++) {
const TokenMetadata& token = transcript->tokens[i];
retval += token.text;
}
return strdup(retval.c_str());
}
std::vector<meta_word>
CandidateTranscriptToWords(const CandidateTranscript* transcript)
{
std::vector<meta_word> word_list;
std::string word = "";
float word_start_time = 0;
// Loop through each token
for (int i = 0; i < transcript->num_tokens; i++) {
const TokenMetadata& token = transcript->tokens[i];
// Append token to word if it's not a space
if (strcmp(token.text, u8" ") != 0) {
// Log the start time of the new word
if (word.length() == 0) {
word_start_time = token.start_time;
}
word.append(token.text);
}
// Word boundary is either a space or the last token in the array
if (strcmp(token.text, u8" ") == 0 || i == transcript->num_tokens-1) {
float word_duration = token.start_time - word_start_time;
if (word_duration < 0) {
word_duration = 0;
}
meta_word w;
w.word = word;
w.start_time = word_start_time;
w.duration = word_duration;
word_list.push_back(w);
// Reset
word = "";
word_start_time = 0;
}
}
return word_list;
}
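To make the word-boundary rule concrete: a hypothetical token sequence "h", "i", " ", "y", "o" with start times 0.10, 0.14, 0.20, 0.32, 0.36 yields two meta_word entries, {word: "hi", start_time: 0.10, duration: 0.10} (duration measured up to the space token at 0.20) and {word: "yo", start_time: 0.32, duration: 0.04} (the trailing word is flushed at the final token, so its duration only reaches that token's start time).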
std::string
CandidateTranscriptToJSON(const CandidateTranscript *transcript)
{
std::ostringstream out_string;
std::vector<meta_word> words = CandidateTranscriptToWords(transcript);
out_string << R"("metadata":{"confidence":)" << transcript->confidence << R"(},"words":[)";
for (int i = 0; i < words.size(); i++) {
meta_word w = words[i];
out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
if (i < words.size() - 1) {
out_string << ",";
}
}
out_string << "]";
return out_string.str();
}
char*
MetadataToJSON(Metadata* result)
{
std::ostringstream out_string;
out_string << "{\n";
for (int j=0; j < result->num_transcripts; ++j) {
const CandidateTranscript *transcript = &result->transcripts[j];
if (j == 0) {
out_string << CandidateTranscriptToJSON(transcript);
if (result->num_transcripts > 1) {
out_string << ",\n" << R"("alternatives")" << ":[\n";
}
} else {
out_string << "{" << CandidateTranscriptToJSON(transcript) << "}";
if (j < result->num_transcripts - 1) {
out_string << ",\n";
} else {
out_string << "\n]";
}
}
}
out_string << "\n}\n";
return strdup(out_string.str().c_str());
}
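With --candidate_transcripts 3, the JSON assembled above takes the following shape; the words, confidences, and timings shown are hypothetical:

{
"metadata":{"confidence":-41.7},"words":[{"word":"hello","time":1.18,"duration":0.36}],
"alternatives":[
{"metadata":{"confidence":-42.3},"words":[{"word":"hallow","time":1.18,"duration":0.36}]},
{"metadata":{"confidence":-44.0},"words":[{"word":"hollow","time":1.18,"duration":0.36}]}
]
}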
ds_result
LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
@@ -57,13 +163,13 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
clock_t ds_start_time = clock();
if (extended_output) {
Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
res.string = metadataToString(metadata);
DS_FreeMetadata(metadata);
Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1);
res.string = CandidateTranscriptToString(&result->transcripts[0]);
DS_FreeMetadata(result);
} else if (json_output) {
Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
res.string = JSONOutput(metadata);
DS_FreeMetadata(metadata);
Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts);
res.string = MetadataToJSON(result);
DS_FreeMetadata(result);
} else if (stream_size > 0) {
StreamingState* ctx;
int status = DS_CreateStream(aCtx, &ctx);
@@ -278,87 +384,6 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
}
}
char*
metadataToString(Metadata* metadata)
{
std::string retval = "";
for (int i = 0; i < metadata->num_items; i++) {
MetadataItem item = metadata->items[i];
retval += item.character;
}
return strdup(retval.c_str());
}
std::vector<meta_word>
WordsFromMetadata(Metadata* metadata)
{
std::vector<meta_word> word_list;
std::string word = "";
float word_start_time = 0;
// Loop through each character
for (int i = 0; i < metadata->num_items; i++) {
MetadataItem item = metadata->items[i];
// Append character to word if it's not a space
if (strcmp(item.character, u8" ") != 0) {
// Log the start time of the new word
if (word.length() == 0) {
word_start_time = item.start_time;
}
word.append(item.character);
}
// Word boundary is either a space or the last character in the array
if (strcmp(item.character, " ") == 0
|| strcmp(item.character, u8" ") == 0
|| i == metadata->num_items-1) {
float word_duration = item.start_time - word_start_time;
if (word_duration < 0) {
word_duration = 0;
}
meta_word w;
w.word = word;
w.start_time = word_start_time;
w.duration = word_duration;
word_list.push_back(w);
// Reset
word = "";
word_start_time = 0;
}
}
return word_list;
}
char*
JSONOutput(Metadata* metadata)
{
std::vector<meta_word> words = WordsFromMetadata(metadata);
std::ostringstream out_string;
out_string << R"({"metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
for (int i = 0; i < words.size(); i++) {
meta_word w = words[i];
out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
if (i < words.size() - 1) {
out_string << ",";
}
}
out_string << "]}\n";
return strdup(out_string.str().c_str());
}
int
main(int argc, char **argv)
{

View file

@@ -157,7 +157,7 @@ DecoderState::next(const double *probs,
}
std::vector<Output>
DecoderState::decode() const
DecoderState::decode(size_t num_results) const
{
std::vector<PathTrie*> prefixes_copy = prefixes_;
std::unordered_map<const PathTrie*, float> scores;
@@ -181,16 +181,12 @@ DecoderState::decode() const
}
using namespace std::placeholders;
size_t num_prefixes = std::min(prefixes_copy.size(), beam_size_);
size_t num_returned = std::min(prefixes_copy.size(), num_results);
std::partial_sort(prefixes_copy.begin(),
prefixes_copy.begin() + num_prefixes,
prefixes_copy.begin() + num_returned,
prefixes_copy.end(),
std::bind(prefix_compare_external, _1, _2, scores));
//TODO: expose this as an API parameter
const size_t top_paths = 1;
size_t num_returned = std::min(num_prefixes, top_paths);
std::vector<Output> outputs;
outputs.reserve(num_returned);

View file

@@ -60,13 +60,16 @@ public:
int time_dim,
int class_dim);
/* Get transcription from current decoder state
/* Get up to num_results transcriptions from current decoder state.
*
* Parameters:
* num_results: Number of beams to return.
*
* Return:
* A vector where each element is a pair of score and decoding result,
* in descending order.
*/
std::vector<Output> decode() const;
std::vector<Output> decode(size_t num_results=1) const;
};
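A minimal sketch of calling the updated decoder, assuming a live DecoderState instance named decoder_state (its construction is elided):

// Ask for up to 5 ranked beams; fewer may survive the search.
std::vector<Output> beams = decoder_state.decode(5);
// beams[0] is the highest-scoring transcript; order is descending by score.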

View file

@@ -60,7 +60,7 @@ using std::vector;
When batch_buffer is full, we do a single step through the acoustic model
and accumulate the intermediate decoding state in the DecoderState structure.
When finishStream() is called, we return the corresponding transcription from
When finishStream() is called, we return the corresponding transcript from
the current decoder state.
*/
struct StreamingState {
@@ -78,9 +78,10 @@ struct StreamingState {
void feedAudioContent(const short* buffer, unsigned int buffer_size);
char* intermediateDecode() const;
Metadata* intermediateDecodeWithMetadata(unsigned int num_results) const;
void finalizeStream();
char* finishStream();
Metadata* finishStreamWithMetadata();
Metadata* finishStreamWithMetadata(unsigned int num_results);
void processAudioWindow(const vector<float>& buf);
void processMfccWindow(const vector<float>& buf);
@@ -136,6 +137,12 @@ StreamingState::intermediateDecode() const
return model_->decode(decoder_state_);
}
Metadata*
StreamingState::intermediateDecodeWithMetadata(unsigned int num_results) const
{
return model_->decode_metadata(decoder_state_, num_results);
}
char*
StreamingState::finishStream()
{
@@ -144,10 +151,10 @@ StreamingState::finishStream()
}
Metadata*
StreamingState::finishStreamWithMetadata()
StreamingState::finishStreamWithMetadata(unsigned int num_results)
{
finalizeStream();
return model_->decode_metadata(decoder_state_);
return model_->decode_metadata(decoder_state_, num_results);
}
void
@@ -402,6 +409,13 @@ DS_IntermediateDecode(const StreamingState* aSctx)
return aSctx->intermediateDecode();
}
Metadata*
DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
unsigned int aNumResults)
{
return aSctx->intermediateDecodeWithMetadata(aNumResults);
}
char*
DS_FinishStream(StreamingState* aSctx)
{
@@ -411,11 +425,12 @@ DS_FinishStream(StreamingState* aSctx)
}
Metadata*
DS_FinishStreamWithMetadata(StreamingState* aSctx)
DS_FinishStreamWithMetadata(StreamingState* aSctx,
unsigned int aNumResults)
{
Metadata* metadata = aSctx->finishStreamWithMetadata();
Metadata* result = aSctx->finishStreamWithMetadata(aNumResults);
DS_FreeStream(aSctx);
return metadata;
return result;
}
StreamingState*
@@ -444,10 +459,11 @@ DS_SpeechToText(ModelState* aCtx,
Metadata*
DS_SpeechToTextWithMetadata(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize)
unsigned int aBufferSize,
unsigned int aNumResults)
{
StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
return DS_FinishStreamWithMetadata(ctx);
return DS_FinishStreamWithMetadata(ctx, aNumResults);
}
void
@@ -460,11 +476,16 @@ void
DS_FreeMetadata(Metadata* m)
{
if (m) {
for (int i = 0; i < m->num_items; ++i) {
free(m->items[i].character);
for (int i = 0; i < m->num_transcripts; ++i) {
for (int j = 0; j < m->transcripts[i].num_tokens; ++j) {
free((void*)m->transcripts[i].tokens[j].text);
}
free((void*)m->transcripts[i].tokens);
}
delete[] m->items;
delete m;
free((void*)m->transcripts);
free(m);
}
}
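Putting the streaming additions together, a sketch of the new call flow; model creation, audio chunking, and error handling are assumed to live in the caller:

// `model` is a ModelState* from DS_CreateModel; `chunk`/`chunk_len` hold 16-bit audio.
StreamingState* stream = nullptr;
if (DS_CreateStream(model, &stream) != 0) { /* handle error */ }
DS_FeedAudioContent(stream, chunk, chunk_len);

// Peek at up to 3 ranked candidates mid-stream; the caller frees the Metadata.
Metadata* partial = DS_IntermediateDecodeWithMetadata(stream, 3);
DS_FreeMetadata(partial);

// Finishing frees the stream itself, but the returned Metadata must still be freed.
Metadata* final_result = DS_FinishStreamWithMetadata(stream, 3);
DS_FreeMetadata(final_result);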

View file

@@ -20,32 +20,43 @@ typedef struct ModelState ModelState;
typedef struct StreamingState StreamingState;
/**
* @brief Stores each individual character, along with its timing information
* @brief Stores text of an individual token, along with its timing information
*/
typedef struct MetadataItem {
/** The character generated for transcription */
char* character;
typedef struct TokenMetadata {
/** The text corresponding to this token */
const char* const text;
/** Position of the character in units of 20ms */
int timestep;
/** Position of the token in units of 20ms */
const unsigned int timestep;
/** Position of the character in seconds */
float start_time;
} MetadataItem;
/** Position of the token in seconds */
const float start_time;
} TokenMetadata;
/**
* @brief Stores the entire CTC output as an array of character metadata objects
* @brief A single transcript computed by the model, including a confidence
* value and the metadata for its constituent tokens.
*/
typedef struct CandidateTranscript {
/** Array of TokenMetadata objects */
const TokenMetadata* const tokens;
/** Size of the tokens array */
const unsigned int num_tokens;
/** Approximated confidence value for this transcript. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcript.
*/
const double confidence;
} CandidateTranscript;
/**
* @brief An array of CandidateTranscript objects computed by the model.
*/
typedef struct Metadata {
/** List of items */
MetadataItem* items;
/** Size of the list of items */
int num_items;
/** Approximated confidence value for this transcription. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcription.
*/
double confidence;
/** Array of CandidateTranscript objects */
const CandidateTranscript* const transcripts;
/** Size of the transcripts array */
const unsigned int num_transcripts;
} Metadata;
enum DeepSpeech_Error_Codes
@@ -164,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx,
float aBeta);
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text.
* @brief Use the DeepSpeech model to convert speech to text.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
@@ -180,21 +191,25 @@ char* DS_SpeechToText(ModelState* aCtx,
unsigned int aBufferSize);
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
* @brief Use the DeepSpeech model to convert speech to text and output results
* including metadata.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this.
*
* @return Outputs a struct of individual letters along with their timing information.
* The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
* @return Metadata struct containing multiple CandidateTranscript structs. Each
* transcript has per-token metadata including timing information. The
* user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
* Returns NULL on error.
*/
DEEPSPEECH_EXPORT
Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize);
unsigned int aBufferSize,
unsigned int aNumResults);
/**
* @brief Create a new streaming inference state. The streaming state returned
@@ -236,8 +251,24 @@ DEEPSPEECH_EXPORT
char* DS_IntermediateDecode(const StreamingState* aSctx);
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns the STT result over the whole audio signal.
* @brief Compute the intermediate decoding of an ongoing streaming inference,
* return results including metadata.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aNumResults The number of candidate transcripts to return.
*
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information. The user is
* responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
* Returns NULL on error.
*/
DEEPSPEECH_EXPORT
Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
unsigned int aNumResults);
/**
* @brief Compute the final decoding of an ongoing streaming inference and return
* the result. Signals the end of an ongoing streaming inference.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
@@ -250,18 +281,23 @@ DEEPSPEECH_EXPORT
char* DS_FinishStream(StreamingState* aSctx);
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns per-letter metadata.
* @brief Compute the final decoding of an ongoing streaming inference and return
* results including metadata. Signals the end of an ongoing streaming
* inference.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aNumResults The number of candidate transcripts to return.
*
* @return Outputs a struct of individual letters along with their timing information.
* The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information. The user is
* responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
* Returns NULL on error.
*
* @note This method will free the state pointer (@p aSctx).
*/
DEEPSPEECH_EXPORT
Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
unsigned int aNumResults);
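As a sketch of consuming the new structures end to end (the model handle, buffer, and sample count are assumed to come from the caller):

#include <cstdio>
// Declarations for ModelState, Metadata, etc. come from deepspeech.h.

void print_candidates(ModelState* model, const short* buffer, unsigned int nsamples) {
  // Request up to 4 candidates; num_transcripts may come back smaller.
  Metadata* result = DS_SpeechToTextWithMetadata(model, buffer, nsamples, 4);
  if (!result) return;
  for (unsigned int i = 0; i < result->num_transcripts; ++i) {
    const CandidateTranscript* t = &result->transcripts[i];
    printf("candidate %u (confidence %f): ", i, t->confidence);
    for (unsigned int j = 0; j < t->num_tokens; ++j) {
      printf("%s", t->tokens[j].text);  // tokens concatenate into the transcript text
    }
    printf("\n");
  }
  DS_FreeMetadata(result);  // frees tokens, transcripts, and the Metadata itself
}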
/**
* @brief Destroy a streaming state without decoding the computed logits. This

View file

@@ -199,13 +199,14 @@ namespace DeepSpeechClient
}
/// <summary>
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to finish.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata result.</returns>
public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream)
public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
{
return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata();
return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
}
/// <summary>
@@ -218,6 +219,17 @@ namespace DeepSpeechClient
return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString();
}
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to decode.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The STT intermediate result.</returns>
public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
{
return NativeImp.DS_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
}
/// <summary>
/// Return version of this library. The returned version is a semantic version
/// (SemVer 2.0.0).
@@ -261,14 +273,15 @@ namespace DeepSpeechClient
}
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata. Returns NULL on error.</returns>
public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
{
return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata();
}
#endregion

View file

@@ -50,11 +50,13 @@
<Compile Include="Extensions\NativeExtensions.cs" />
<Compile Include="Models\DeepSpeechStream.cs" />
<Compile Include="Models\Metadata.cs" />
<Compile Include="Models\MetadataItem.cs" />
<Compile Include="Models\CandidateTranscript.cs" />
<Compile Include="Models\TokenMetadata.cs" />
<Compile Include="NativeImp.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Structs\Metadata.cs" />
<Compile Include="Structs\MetadataItem.cs" />
<Compile Include="Structs\CandidateTranscript.cs" />
<Compile Include="Structs\TokenMetadata.cs" />
</ItemGroup>
<ItemGroup />
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />

View file

@@ -26,35 +26,68 @@ namespace DeepSpeechClient.Extensions
}
/// <summary>
/// Converts a pointer into managed metadata object.
/// Converts a pointer into managed TokenMetadata object.
/// </summary>
/// <param name="intPtr">Native pointer.</param>
/// <returns>TokenMetadata managed object.</returns>
private static Models.TokenMetadata PtrToTokenMetadata(this IntPtr intPtr)
{
var token = Marshal.PtrToStructure<TokenMetadata>(intPtr);
var managedToken = new Models.TokenMetadata
{
Timestep = token.timestep,
StartTime = token.start_time,
Text = token.text.PtrToString(releasePtr: false)
};
return managedToken;
}
/// <summary>
/// Converts a pointer into managed CandidateTranscript object.
/// </summary>
/// <param name="intPtr">Native pointer.</param>
/// <returns>CandidateTranscript managed object.</returns>
private static Models.CandidateTranscript PtrToCandidateTranscript(this IntPtr intPtr)
{
var managedTranscript = new Models.CandidateTranscript();
var transcript = Marshal.PtrToStructure<CandidateTranscript>(intPtr);
managedTranscript.Tokens = new Models.TokenMetadata[transcript.num_tokens];
managedTranscript.Confidence = transcript.confidence;
//we need to manually read each item from the native ptr using its size
var sizeOfTokenMetadata = Marshal.SizeOf(typeof(TokenMetadata));
for (int i = 0; i < transcript.num_tokens; i++)
{
managedTranscript.Tokens[i] = transcript.tokens.PtrToTokenMetadata();
transcript.tokens += sizeOfTokenMetadata;
}
return managedTranscript;
}
/// <summary>
/// Converts a pointer into managed Metadata object.
/// </summary>
/// <param name="intPtr">Native pointer.</param>
/// <returns>Metadata managed object.</returns>
internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
{
var managedMetaObject = new Models.Metadata();
var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
managedMetaObject.Confidence = metaData.confidence;
var managedMetadata = new Models.Metadata();
var metadata = Marshal.PtrToStructure<Metadata>(intPtr);
managedMetadata.Transcripts = new Models.CandidateTranscript[metadata.num_transcripts];
//we need to manually read each item from the native ptr using its size
var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
for (int i = 0; i < metaData.num_items; i++)
var sizeOfCandidateTranscript = Marshal.SizeOf(typeof(CandidateTranscript));
for (int i = 0; i < metadata.num_transcripts; i++)
{
var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
managedMetaObject.Items[i] = new Models.MetadataItem
{
Timestep = tempItem.timestep,
StartTime = tempItem.start_time,
Character = tempItem.character.PtrToString(releasePtr: false)
};
//we keep the offset on each read
metaData.items += sizeOfMetaItem;
managedMetadata.Transcripts[i] = metadata.transcripts.PtrToCandidateTranscript();
metadata.transcripts += sizeOfCandidateTranscript;
}
NativeImp.DS_FreeMetadata(intPtr);
return managedMetaObject;
return managedMetadata;
}
}
}

View file

@@ -68,13 +68,15 @@ namespace DeepSpeechClient.Interfaces
uint aBufferSize);
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata. Returns NULL on error.</returns>
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
uint aBufferSize);
uint aBufferSize,
uint aNumResults);
/// <summary>
/// Destroy a streaming state without decoding the computed logits.
@@ -102,6 +104,14 @@ namespace DeepSpeechClient.Interfaces
/// <returns>The STT intermediate result.</returns>
unsafe string IntermediateDecode(DeepSpeechStream stream);
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to decode.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata result.</returns>
unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
/// <summary>
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// </summary>
@@ -110,10 +120,11 @@ namespace DeepSpeechClient.Interfaces
unsafe string FinishStream(DeepSpeechStream stream);
/// <summary>
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to finish.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata result.</returns>
unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
}
}

View file

@@ -0,0 +1,17 @@
namespace DeepSpeechClient.Models
{
/// <summary>
/// A single transcript computed by the model, including a confidence value and the metadata for its constituent tokens.
/// </summary>
public class CandidateTranscript
{
/// <summary>
/// Approximated confidence value for this transcription.
/// </summary>
public double Confidence { get; set; }
/// <summary>
/// List of metadata tokens containing text, timestep, and time offset.
/// </summary>
public TokenMetadata[] Tokens { get; set; }
}
}

View file

@@ -6,12 +6,8 @@
public class Metadata
{
/// <summary>
/// Approximated confidence value for this transcription.
/// List of candidate transcripts.
/// </summary>
public double Confidence { get; set; }
/// <summary>
/// List of metada items containing char, timespet, and time offset.
/// </summary>
public MetadataItem[] Items { get; set; }
public CandidateTranscript[] Transcripts { get; set; }
}
}

View file

@@ -3,12 +3,12 @@
/// <summary>
/// Stores each individual character, along with its timing information.
/// </summary>
public class MetadataItem
public class TokenMetadata
{
/// <summary>
/// Char of the current timestep.
/// </summary>
public string Character;
public string Text;
/// <summary>
/// Position of the character in units of 20ms.
/// </summary>

View file

@@ -17,45 +17,46 @@ namespace DeepSpeechClient
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath,
ref IntPtr** pint);
ref IntPtr** pint);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern uint DS_GetModelBeamWidth(IntPtr** aCtx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern ErrorCodes DS_SetModelBeamWidth(IntPtr** aCtx,
uint aBeamWidth);
uint aBeamWidth);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath,
uint aBeamWidth,
ref IntPtr** pint);
uint aBeamWidth,
ref IntPtr** pint);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx,
string aScorerPath);
string aScorerPath);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx,
float aAlpha,
float aBeta);
float aAlpha,
float aBeta);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx,
short[] aBuffer,
uint aBufferSize);
short[] aBuffer,
uint aBufferSize);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
short[] aBuffer,
uint aBufferSize);
short[] aBuffer,
uint aBufferSize,
uint aNumResults);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);
@@ -76,18 +77,23 @@ namespace DeepSpeechClient
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern void DS_FeedAudioContent(IntPtr** aSctx,
short[] aBuffer,
uint aBufferSize);
short[] aBuffer,
uint aBufferSize);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx,
uint aNumResults);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx);
internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx,
uint aNumResults);
#endregion
}
}

View file

@@ -0,0 +1,22 @@
using System;
using System.Runtime.InteropServices;
namespace DeepSpeechClient.Structs
{
[StructLayout(LayoutKind.Sequential)]
internal unsafe struct CandidateTranscript
{
/// <summary>
/// Native list of tokens.
/// </summary>
internal unsafe IntPtr tokens;
/// <summary>
/// Count of tokens from the native side.
/// </summary>
internal unsafe int num_tokens;
/// <summary>
/// Approximated confidence value for this transcription.
/// </summary>
internal unsafe double confidence;
}
}

View file

@@ -7,16 +7,12 @@ namespace DeepSpeechClient.Structs
internal unsafe struct Metadata
{
/// <summary>
/// Native list of items.
/// Native list of candidate transcripts.
/// </summary>
internal unsafe IntPtr items;
internal unsafe IntPtr transcripts;
/// <summary>
/// Count of items from the native side.
/// Count of transcripts from the native side.
/// </summary>
internal unsafe int num_items;
/// <summary>
/// Approximated confidence value for this transcription.
/// </summary>
internal unsafe double confidence;
internal unsafe int num_transcripts;
}
}

View file

@@ -4,12 +4,12 @@ using System.Runtime.InteropServices;
namespace DeepSpeechClient.Structs
{
[StructLayout(LayoutKind.Sequential)]
internal unsafe struct MetadataItem
internal unsafe struct TokenMetadata
{
/// <summary>
/// Native character.
/// Native text.
/// </summary>
internal unsafe IntPtr character;
internal unsafe IntPtr text;
/// <summary>
/// Position of the character in units of 20ms.
/// </summary>

View file

@@ -21,14 +21,14 @@ namespace CSharpExamples
static string GetArgument(IEnumerable<string> args, string option)
=> args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault();
static string MetadataToString(Metadata meta)
static string MetadataToString(CandidateTranscript transcript)
{
var nl = Environment.NewLine;
string retval =
Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}"
+ $"Confidence: {meta?.Confidence} {nl}"
+ $"Item count: {meta?.Items?.Length} {nl}"
+ string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}"));
Environment.NewLine + $"Recognized text: {string.Join("", transcript?.Tokens?.Select(x => x.Text))} {nl}"
+ $"Confidence: {transcript?.Confidence} {nl}"
+ $"Item count: {transcript?.Tokens?.Length} {nl}"
+ string.Join(nl, transcript?.Tokens?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Text}"));
return retval;
}
@@ -75,8 +75,8 @@ namespace CSharpExamples
if (extended)
{
Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer,
Convert.ToUInt32(waveBuffer.MaxSize / 2));
speechResult = MetadataToString(metaResult);
Convert.ToUInt32(waveBuffer.MaxSize / 2), 1);
speechResult = MetadataToString(metaResult.Transcripts[0]);
}
else
{

View file

@@ -6,6 +6,8 @@
%}
%include "typemaps.i"
%include "enums.swg"
%javaconst(1);
%include "arrays_java.i"
// apply to DS_FeedAudioContent and DS_SpeechToText
@@ -15,21 +17,29 @@
%pointer_functions(ModelState*, modelstatep);
%pointer_functions(StreamingState*, streamingstatep);
%typemap(newfree) char* "DS_FreeString($1);";
%include "carrays.i"
%array_functions(struct MetadataItem, metadataItem_array);
%extend struct CandidateTranscript {
/**
* Retrieve one TokenMetadata element
*
* @param i Array index of the TokenMetadata to get
*
* @return The TokenMetadata requested or null
*/
const TokenMetadata& getToken(int i) {
return self->tokens[i];
}
}
%extend struct Metadata {
/**
* Retrieve one MetadataItem element
* Retrieve one CandidateTranscript element
*
* @param i Array index of the MetadataItem to get
* @param i Array index of the CandidateTranscript to get
*
* @return The MetadataItem requested or null
* @return The CandidateTranscript requested or null
*/
MetadataItem getItem(int i) {
return metadataItem_array_getitem(self->items, i);
const CandidateTranscript& getTranscript(int i) {
return self->transcripts[i];
}
~Metadata() {
@@ -37,14 +47,18 @@
}
}
%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;
%nodefaultdtor Metadata;
%nodefaultctor CandidateTranscript;
%nodefaultdtor CandidateTranscript;
%nodefaultctor TokenMetadata;
%nodefaultdtor TokenMetadata;
%typemap(newfree) char* "DS_FreeString($1);";
%newobject DS_SpeechToText;
%newobject DS_IntermediateDecode;
%newobject DS_FinishStream;
%newobject DS_ErrorCodeToErrorMessage;
%rename ("%(strip:[DS_])s") "";

View file

@@ -12,7 +12,7 @@ import org.junit.runners.MethodSorters;
import static org.junit.Assert.*;
import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
import org.mozilla.deepspeech.libdeepspeech.Metadata;
import org.mozilla.deepspeech.libdeepspeech.CandidateTranscript;
import java.io.RandomAccessFile;
import java.io.FileNotFoundException;
@@ -61,10 +61,10 @@ public class BasicTest {
m.freeModel();
}
private String metadataToString(Metadata m) {
private String candidateTranscriptToString(CandidateTranscript t) {
String retval = "";
for (int i = 0; i < m.getNum_items(); ++i) {
retval += m.getItem(i).getCharacter();
for (int i = 0; i < t.getNum_tokens(); ++i) {
retval += t.getToken(i).getText();
}
return retval;
}
@@ -97,7 +97,7 @@ public class BasicTest {
ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);
if (extendedMetadata) {
return metadataToString(m.sttWithMetadata(shorts, shorts.length));
return candidateTranscriptToString(m.sttWithMetadata(shorts, shorts.length, 1).getTranscript(0));
} else {
return m.stt(shorts, shorts.length);
}

View file

@@ -11,8 +11,15 @@ public class DeepSpeechModel {
}
// FIXME: We should have something better than those SWIGTYPE_*
SWIGTYPE_p_p_ModelState _mspp;
SWIGTYPE_p_ModelState _msp;
private SWIGTYPE_p_p_ModelState _mspp;
private SWIGTYPE_p_ModelState _msp;
private void evaluateErrorCode(int errorCode) {
DeepSpeech_Error_Codes code = DeepSpeech_Error_Codes.swigToEnum(errorCode);
if (code != DeepSpeech_Error_Codes.ERR_OK) {
throw new RuntimeException("Error: " + impl.ErrorCodeToErrorMessage(errorCode) + " (0x" + Integer.toHexString(errorCode) + ").");
}
}
/**
* @brief An object providing an interface to a trained DeepSpeech model.
@@ -20,10 +27,12 @@ public class DeepSpeechModel {
* @constructor
*
* @param modelPath The path to the frozen model graph.
*
* @throws RuntimeException on failure.
*/
public DeepSpeechModel(String modelPath) {
this._mspp = impl.new_modelstatep();
impl.CreateModel(modelPath, this._mspp);
evaluateErrorCode(impl.CreateModel(modelPath, this._mspp));
this._msp = impl.modelstatep_value(this._mspp);
}
@@ -43,10 +52,10 @@ public class DeepSpeechModel {
* @param aBeamWidth The beam width used by the model. A larger beam width value
* generates better results at the cost of decoding time.
*
* @return Zero on success, non-zero on failure.
* @throws RuntimeException on failure.
*/
public int setBeamWidth(long beamWidth) {
return impl.SetModelBeamWidth(this._msp, beamWidth);
public void setBeamWidth(long beamWidth) {
evaluateErrorCode(impl.SetModelBeamWidth(this._msp, beamWidth));
}
/**
@@ -70,19 +79,19 @@ public class DeepSpeechModel {
*
* @param scorer The path to the external scorer file.
*
* @return Zero on success, non-zero on failure (invalid arguments).
* @throws RuntimeException on failure.
*/
public void enableExternalScorer(String scorer) {
impl.EnableExternalScorer(this._msp, scorer);
evaluateErrorCode(impl.EnableExternalScorer(this._msp, scorer));
}
/**
* @brief Disable decoding using an external scorer.
*
* @return Zero on success, non-zero on failure (invalid arguments).
* @throws RuntimeException on failure.
*/
public void disableExternalScorer() {
impl.DisableExternalScorer(this._msp);
evaluateErrorCode(impl.DisableExternalScorer(this._msp));
}
/**
@@ -91,10 +100,10 @@ public class DeepSpeechModel {
* @param alpha The alpha hyperparameter of the decoder. Language model weight.
* @param beta The beta hyperparameter of the decoder. Word insertion weight.
*
* @return Zero on success, non-zero on failure (invalid arguments).
* @throws RuntimeException on failure.
*/
public void setScorerAlphaBeta(float alpha, float beta) {
impl.SetScorerAlphaBeta(this._msp, alpha, beta);
evaluateErrorCode(impl.SetScorerAlphaBeta(this._msp, alpha, beta));
}
/*
@@ -117,11 +126,13 @@ public class DeepSpeechModel {
* @param buffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param buffer_size The number of samples in the audio signal.
* @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
* @return Outputs a Metadata object of individual letters along with their timing information.
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information.
*/
public Metadata sttWithMetadata(short[] buffer, int buffer_size) {
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size);
public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) {
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results);
}
/**
@@ -130,10 +141,12 @@ public class DeepSpeechModel {
* and finishStream().
*
* @return An opaque object that represents the streaming state.
*
* @throws RuntimeException on failure.
*/
public DeepSpeechStreamingState createStream() {
SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
impl.CreateStream(this._msp, ssp);
evaluateErrorCode(impl.CreateStream(this._msp, ssp));
return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
}
@@ -161,8 +174,20 @@ public class DeepSpeechModel {
}
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns the STT result over the whole audio signal.
* @brief Compute the intermediate decoding of an ongoing streaming inference.
*
* @param ctx A streaming state pointer returned by createStream().
* @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
* @return The STT intermediate result.
*/
public Metadata intermediateDecodeWithMetadata(DeepSpeechStreamingState ctx, int num_results) {
return impl.IntermediateDecodeWithMetadata(ctx.get(), num_results);
}
/**
* @brief Compute the final decoding of an ongoing streaming inference and return
* the result. Signals the end of an ongoing streaming inference.
*
* @param ctx A streaming state pointer returned by createStream().
*
@@ -175,16 +200,19 @@ public class DeepSpeechModel {
}
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns per-letter metadata.
* @brief Compute the final decoding of an ongoing streaming inference and return
* the results including metadata. Signals the end of an ongoing streaming
* inference.
*
* @param ctx A streaming state pointer returned by createStream().
* @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
* @return Outputs a Metadata object of individual letters along with their timing information.
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information.
*
* @note This method will free the state pointer (@p ctx).
*/
public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx) {
return impl.FinishStreamWithMetadata(ctx.get());
public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx, int num_results) {
return impl.FinishStreamWithMetadata(ctx.get(), num_results);
}
}

View file

@@ -0,0 +1,73 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */
package org.mozilla.deepspeech.libdeepspeech;
/**
* A single transcript computed by the model, including a confidence<br>
* value and the metadata for its constituent tokens.
*/
public class CandidateTranscript {
private transient long swigCPtr;
protected transient boolean swigCMemOwn;
protected CandidateTranscript(long cPtr, boolean cMemoryOwn) {
swigCMemOwn = cMemoryOwn;
swigCPtr = cPtr;
}
protected static long getCPtr(CandidateTranscript obj) {
return (obj == null) ? 0 : obj.swigCPtr;
}
public synchronized void delete() {
if (swigCPtr != 0) {
if (swigCMemOwn) {
swigCMemOwn = false;
throw new UnsupportedOperationException("C++ destructor does not have public access");
}
swigCPtr = 0;
}
}
/**
* Array of TokenMetadata objects
*/
public TokenMetadata getTokens() {
long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this);
return (cPtr == 0) ? null : new TokenMetadata(cPtr, false);
}
/**
* Size of the tokens array
*/
public long getNum_tokens() {
return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this);
}
/**
* Approximated confidence value for this transcript. This is roughly the<br>
* sum of the acoustic model logit values for each timestep/character that<br>
* contributed to the creation of this transcript.
*/
public double getConfidence() {
return implJNI.CandidateTranscript_confidence_get(swigCPtr, this);
}
/**
* Retrieve one TokenMetadata element<br>
* <br>
* @param i Array index of the TokenMetadata to get<br>
* <br>
* @return The TokenMetadata requested or null
*/
public TokenMetadata getToken(int i) {
return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), false);
}
}

View file

@@ -0,0 +1,65 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */
package org.mozilla.deepspeech.libdeepspeech;
public enum DeepSpeech_Error_Codes {
ERR_OK(0x0000),
ERR_NO_MODEL(0x1000),
ERR_INVALID_ALPHABET(0x2000),
ERR_INVALID_SHAPE(0x2001),
ERR_INVALID_SCORER(0x2002),
ERR_MODEL_INCOMPATIBLE(0x2003),
ERR_SCORER_NOT_ENABLED(0x2004),
ERR_FAIL_INIT_MMAP(0x3000),
ERR_FAIL_INIT_SESS(0x3001),
ERR_FAIL_INTERPRETER(0x3002),
ERR_FAIL_RUN_SESS(0x3003),
ERR_FAIL_CREATE_STREAM(0x3004),
ERR_FAIL_READ_PROTOBUF(0x3005),
ERR_FAIL_CREATE_SESS(0x3006),
ERR_FAIL_CREATE_MODEL(0x3007);
public final int swigValue() {
return swigValue;
}
public static DeepSpeech_Error_Codes swigToEnum(int swigValue) {
DeepSpeech_Error_Codes[] swigValues = DeepSpeech_Error_Codes.class.getEnumConstants();
if (swigValue < swigValues.length && swigValue >= 0 && swigValues[swigValue].swigValue == swigValue)
return swigValues[swigValue];
for (DeepSpeech_Error_Codes swigEnum : swigValues)
if (swigEnum.swigValue == swigValue)
return swigEnum;
throw new IllegalArgumentException("No enum " + DeepSpeech_Error_Codes.class + " with value " + swigValue);
}
@SuppressWarnings("unused")
private DeepSpeech_Error_Codes() {
this.swigValue = SwigNext.next++;
}
@SuppressWarnings("unused")
private DeepSpeech_Error_Codes(int swigValue) {
this.swigValue = swigValue;
SwigNext.next = swigValue+1;
}
@SuppressWarnings("unused")
private DeepSpeech_Error_Codes(DeepSpeech_Error_Codes swigEnum) {
this.swigValue = swigEnum.swigValue;
SwigNext.next = this.swigValue+1;
}
private final int swigValue;
private static class SwigNext {
private static int next = 0;
}
}

View file

@@ -1,6 +1,6 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.2
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
@@ -9,7 +9,7 @@
package org.mozilla.deepspeech.libdeepspeech;
/**
* Stores the entire CTC output as an array of character metadata objects
* An array of CandidateTranscript objects computed by the model.
*/
public class Metadata {
private transient long swigCPtr;
@@ -40,61 +40,29 @@ public class Metadata {
}
/**
* List of items
* Array of CandidateTranscript objects
*/
public void setItems(MetadataItem value) {
implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value);
public CandidateTranscript getTranscripts() {
long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this);
return (cPtr == 0) ? null : new CandidateTranscript(cPtr, false);
}
/**
* List of items
* Size of the transcripts array
*/
public MetadataItem getItems() {
long cPtr = implJNI.Metadata_items_get(swigCPtr, this);
return (cPtr == 0) ? null : new MetadataItem(cPtr, false);
public long getNum_transcripts() {
return implJNI.Metadata_num_transcripts_get(swigCPtr, this);
}
/**
* Size of the list of items
*/
public void setNum_items(int value) {
implJNI.Metadata_num_items_set(swigCPtr, this, value);
}
/**
* Size of the list of items
*/
public int getNum_items() {
return implJNI.Metadata_num_items_get(swigCPtr, this);
}
/**
* Approximated confidence value for this transcription. This is roughly the<br>
* sum of the acoustic model logit values for each timestep/character that<br>
* contributed to the creation of this transcription.
*/
public void setConfidence(double value) {
implJNI.Metadata_confidence_set(swigCPtr, this, value);
}
/**
* Approximated confidence value for this transcription. This is roughly the<br>
* sum of the acoustic model logit values for each timestep/character that<br>
* contributed to the creation of this transcription.
*/
public double getConfidence() {
return implJNI.Metadata_confidence_get(swigCPtr, this);
}
/**
* Retrieve one MetadataItem element<br>
* Retrieve one CandidateTranscript element<br>
* <br>
* @param i Array index of the MetadataItem to get<br>
* @param i Array index of the CandidateTranscript to get<br>
* <br>
* @return The MetadataItem requested or null
* @return The CandidateTranscript requested or null
*/
public MetadataItem getItem(int i) {
return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true);
public CandidateTranscript getTranscript(int i) {
return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), false);
}
}

View file

@@ -4,7 +4,7 @@ Javadoc for Sphinx
This code is only here for reference for documentation generation.
To update, please build SWIG (4.0 at least) and then run from native_client/java:
To update, please install SWIG (4.0 at least) and then run from native_client/java:
.. code-block::

View file

@@ -0,0 +1,58 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */
package org.mozilla.deepspeech.libdeepspeech;
/**
* Stores text of an individual token, along with its timing information
*/
public class TokenMetadata {
private transient long swigCPtr;
protected transient boolean swigCMemOwn;
protected TokenMetadata(long cPtr, boolean cMemoryOwn) {
swigCMemOwn = cMemoryOwn;
swigCPtr = cPtr;
}
protected static long getCPtr(TokenMetadata obj) {
return (obj == null) ? 0 : obj.swigCPtr;
}
public synchronized void delete() {
if (swigCPtr != 0) {
if (swigCMemOwn) {
swigCMemOwn = false;
throw new UnsupportedOperationException("C++ destructor does not have public access");
}
swigCPtr = 0;
}
}
/**
* The text corresponding to this token
*/
public String getText() {
return implJNI.TokenMetadata_text_get(swigCPtr, this);
}
/**
* Position of the token in units of 20ms
*/
public long getTimestep() {
return implJNI.TokenMetadata_timestep_get(swigCPtr, this);
}
/**
* Position of the token in seconds
*/
public float getStart_time() {
return implJNI.TokenMetadata_start_time_get(swigCPtr, this);
}
}

View file

@@ -42,12 +42,11 @@ function totalTime(hrtimeValue) {
return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
}
function metadataToString(metadata) {
function candidateTranscriptToString(transcript) {
var retval = ""
for (var i = 0; i < metadata.num_items; ++i) {
retval += metadata.items[i].character;
for (var i = 0; i < transcript.tokens.length; ++i) {
retval += transcript.tokens[i].text;
}
Ds.FreeMetadata(metadata);
return retval;
}
@@ -117,7 +116,9 @@ audioStream.on('finish', () => {
const audioLength = (audioBuffer.length / 2) * (1 / desired_sample_rate);
if (args['extended']) {
console.log(metadataToString(model.sttWithMetadata(audioBuffer)));
let metadata = model.sttWithMetadata(audioBuffer, 1);
console.log(candidateTranscriptToString(metadata.transcripts[0]));
Ds.FreeMetadata(metadata);
} else {
console.log(model.stt(audioBuffer));
}

View file

@@ -47,8 +47,8 @@ using namespace node;
%typemap(argout) ModelState **retval {
$result = SWIGV8_ARRAY_NEW();
SWIGV8_AppendOutput($result, SWIG_From_int(result));
// owned by SWIG, ModelState destructor gets called when the JavaScript object is finalized (see below)
%append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, SWIG_POINTER_OWN));
// owned by the application. NodeJS does not guarantee the finalizer will be called so applications must call FreeMetadata themselves.
%append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
}
@@ -68,27 +68,29 @@ using namespace node;
%nodefaultctor ModelState;
%nodefaultdtor ModelState;
%typemap(out) MetadataItem* %{
%typemap(out) TokenMetadata* %{
$result = SWIGV8_ARRAY_NEW();
for (int i = 0; i < arg1->num_items; ++i) {
SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_MetadataItem, SWIG_POINTER_OWN));
for (int i = 0; i < arg1->num_tokens; ++i) {
SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_TokenMetadata, 0));
}
%}
%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;
%extend struct Metadata {
~Metadata() {
DS_FreeMetadata($self);
%typemap(out) CandidateTranscript* %{
$result = SWIGV8_ARRAY_NEW();
for (int i = 0; i < arg1->num_transcripts; ++i) {
SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_CandidateTranscript, 0));
}
}
%}
%extend struct MetadataItem {
~MetadataItem() { }
}
%ignore Metadata::num_transcripts;
%ignore CandidateTranscript::num_tokens;
%nodefaultctor Metadata;
%nodefaultdtor Metadata;
%nodefaultctor CandidateTranscript;
%nodefaultdtor CandidateTranscript;
%nodefaultctor TokenMetadata;
%nodefaultdtor TokenMetadata;
%rename ("%(strip:[DS_])s") "";

View file

@@ -115,15 +115,16 @@ Model.prototype.stt = function(aBuffer) {
}
/**
* Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
* Use the DeepSpeech model to perform Speech-To-Text and output results including metadata.
*
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
* @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
Model.prototype.sttWithMetadata = function(aBuffer) {
return binding.SpeechToTextWithMetadata(this._impl, aBuffer);
Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
aNumResults = aNumResults || 1;
return binding.SpeechToTextWithMetadata(this._impl, aBuffer, aNumResults);
}
/**
@@ -172,7 +173,19 @@ Stream.prototype.intermediateDecode = function() {
}
/**
* Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
* Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
*
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
*
* @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
aNumResults = aNumResults || 1;
return binding.IntermediateDecodeWithMetadata(this._impl, aNumResults);
}
/**
* Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
*
* @return {string} The STT result.
*
@@ -185,14 +198,17 @@ Stream.prototype.finishStream = function() {
}
/**
* Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
* Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
*
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
*
* This method will free the stream; it must not be used after this method is called.
*/
Stream.prototype.finishStreamWithMetadata = function() {
result = binding.FinishStreamWithMetadata(this._impl);
Stream.prototype.finishStreamWithMetadata = function(aNumResults) {
aNumResults = aNumResults || 1;
const result = binding.FinishStreamWithMetadata(this._impl, aNumResults);
this._impl = null;
return result;
}
@@ -236,70 +252,80 @@ function Version() {
}
//// Metadata and MetadataItem are here only for documentation purposes
//// Metadata, CandidateTranscript and TokenMetadata are here only for documentation purposes
/**
* @class
*
* Stores each individual character, along with its timing information
* Stores text of an individual token, along with its timing information
*/
function MetadataItem() {}
function TokenMetadata() {}
/**
* The character generated for transcription
* The text corresponding to this token
*
* @return {string} The character generated
* @return {string} The text generated
*/
MetadataItem.prototype.character = function() {}
TokenMetadata.prototype.text = function() {}
/**
* Position of the character in units of 20ms
* Position of the token in units of 20ms
*
* @return {int} The position of the character
* @return {int} The position of the token
*/
MetadataItem.prototype.timestep = function() {};
TokenMetadata.prototype.timestep = function() {};
/**
* Position of the character in seconds
* Position of the token in seconds
*
* @return {float} The position of the character
* @return {float} The position of the token
*/
MetadataItem.prototype.start_time = function() {};
TokenMetadata.prototype.start_time = function() {};
/**
* @class
*
* Stores the entire CTC output as an array of character metadata objects
* A single transcript computed by the model, including a confidence value and
* the metadata for its constituent tokens.
*/
function Metadata () {}
function CandidateTranscript () {}
/**
* List of items
* Array of tokens
*
* @return {array} List of :js:func:`MetadataItem`
* @return {array} Array of :js:func:`TokenMetadata`
*/
Metadata.prototype.items = function() {}
/**
* Size of the list of items
*
* @return {int} Number of items
*/
Metadata.prototype.num_items = function() {}
CandidateTranscript.prototype.tokens = function() {}
/**
* Approximated confidence value for this transcription. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* sum of the acoustic model logit values for each timestep/token that
* contributed to the creation of this transcription.
*
* @return {float} Confidence value
*/
Metadata.prototype.confidence = function() {}
CandidateTranscript.prototype.confidence = function() {}
/**
* @class
*
* An array of CandidateTranscript objects computed by the model.
*/
function Metadata () {}
/**
* Array of transcripts
*
* @return {array} Array of :js:func:`CandidateTranscript` objects
*/
Metadata.prototype.transcripts = function() {}
module.exports = {
Model: Model,
Metadata: Metadata,
MetadataItem: MetadataItem,
CandidateTranscript: CandidateTranscript,
TokenMetadata: TokenMetadata,
Version: Version,
FreeModel: FreeModel,
FreeStream: FreeStream,

View file

@@ -37,27 +37,39 @@ ModelState::decode(const DecoderState& state) const
}
Metadata*
ModelState::decode_metadata(const DecoderState& state)
ModelState::decode_metadata(const DecoderState& state,
size_t num_results)
{
vector<Output> out = state.decode();
vector<Output> out = state.decode(num_results);
unsigned int num_returned = out.size();
std::unique_ptr<Metadata> metadata(new Metadata());
metadata->num_items = out[0].tokens.size();
metadata->confidence = out[0].confidence;
CandidateTranscript* transcripts = (CandidateTranscript*)malloc(sizeof(CandidateTranscript)*num_returned);
std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]());
for (int i = 0; i < num_returned; ++i) {
TokenMetadata* tokens = (TokenMetadata*)malloc(sizeof(TokenMetadata)*out[i].tokens.size());
// Loop through each character
for (int i = 0; i < out[0].tokens.size(); ++i) {
items[i].character = strdup(alphabet_.StringFromLabel(out[0].tokens[i]).c_str());
items[i].timestep = out[0].timesteps[i];
items[i].start_time = out[0].timesteps[i] * ((float)audio_win_step_ / sample_rate_);
if (items[i].start_time < 0) {
items[i].start_time = 0;
for (int j = 0; j < out[i].tokens.size(); ++j) {
TokenMetadata token {
strdup(alphabet_.StringFromLabel(out[i].tokens[j]).c_str()), // text
static_cast<unsigned int>(out[i].timesteps[j]), // timestep
out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_), // start_time
};
memcpy(&tokens[j], &token, sizeof(TokenMetadata));
}
CandidateTranscript transcript {
tokens, // tokens
static_cast<unsigned int>(out[i].tokens.size()), // num_tokens
out[i].confidence, // confidence
};
memcpy(&transcripts[i], &transcript, sizeof(CandidateTranscript));
}
metadata->items = items.release();
return metadata.release();
Metadata* ret = (Metadata*)malloc(sizeof(Metadata));
Metadata metadata {
transcripts, // transcripts
num_returned, // num_transcripts
};
memcpy(ret, &metadata, sizeof(Metadata));
return ret;
}

View file

@@ -66,11 +66,14 @@ struct ModelState {
* @brief Return token-level metadata including token timings.
*
* @param state Decoder state to use when decoding.
* @param num_results Maximum number of candidate results to return.
*
* @return Metadata struct containing MetadataItem structs for each character.
* The user is responsible for freeing Metadata by calling DS_FreeMetadata().
* @return A Metadata struct containing CandidateTranscript structs.
* Each represents a candidate transcript, with the first ranked most probable.
* The user is responsible for freeing the returned Metadata by calling DS_FreeMetadata().
*/
virtual Metadata* decode_metadata(const DecoderState& state);
virtual Metadata* decode_metadata(const DecoderState& state,
size_t num_results);
};
#endif // MODELSTATE_H

View file

@@ -121,17 +121,20 @@ class Model(object):
"""
return deepspeech.impl.SpeechToText(self._impl, audio_buffer)
def sttWithMetadata(self, audio_buffer):
def sttWithMetadata(self, audio_buffer, num_results=1):
"""
Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.
:param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type audio_buffer: numpy.int16 array
:return: Outputs a struct of individual letters along with their timing information.
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata`
"""
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer)
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results)
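
A short sketch of how the candidate list behaves; `model` and `audio` are assumed, and the decoder may return fewer transcripts than requested:

.. code-block:: python

   # Hedged sketch: num_results is an upper bound, not a guarantee.
   metadata = model.sttWithMetadata(audio, num_results=5)
   assert len(metadata.transcripts) <= 5
   best = metadata.transcripts[0]  # candidates come ranked most probable first
   print(''.join(token.text for token in best.tokens))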
def createStream(self):
"""
@@ -187,10 +190,27 @@ class Stream(object):
raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
return deepspeech.impl.IntermediateDecode(self._impl)
def intermediateDecodeWithMetadata(self, num_results=1):
"""
Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata`
:throws: RuntimeError if the stream object is not valid
"""
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
return deepspeech.impl.IntermediateDecodeWithMetadata(self._impl, num_results)
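
A hedged streaming sketch; `model` is assumed, `audio_frames` is a hypothetical iterable of 16-bit numpy chunks, and feedAudioContent is the streaming feed method from the same class (not shown in this hunk):

.. code-block:: python

   # Hedged sketch: poll intermediate candidates while feeding audio.
   stream = model.createStream()
   for frame in audio_frames:
       stream.feedAudioContent(frame)
       partial = stream.intermediateDecodeWithMetadata(num_results=1)
       print(''.join(token.text for token in partial.transcripts[0].tokens))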
def finishStream(self):
"""
Signal the end of an audio signal to an ongoing streaming inference,
returns the STT result over the whole audio signal.
Compute the final decoding of an ongoing streaming inference and return
the result. Signals the end of an ongoing streaming inference. The underlying
stream object must not be used after this method is called.
:return: The STT result.
:type: str
@@ -203,19 +223,24 @@ class Stream(object):
self._impl = None
return result
def finishStreamWithMetadata(self):
def finishStreamWithMetadata(self, num_results=1):
"""
Signal the end of an audio signal to an ongoing streaming inference,
returns per-letter metadata.
Compute the final decoding of an ongoing streaming inference and return
results including metadata. Signals the end of an ongoing streaming
inference. The underlying stream object must not be used after this
method is called.
:return: Outputs a struct of individual letters along with their timing information.
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata`
:throws: RuntimeError if the stream object is not valid
"""
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
result = deepspeech.impl.FinishStreamWithMetadata(self._impl)
result = deepspeech.impl.FinishStreamWithMetadata(self._impl, num_results)
self._impl = None
return result
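
Continuing the hypothetical stream above, a hedged end-of-stream sketch:

.. code-block:: python

   # Hedged sketch: finish the stream, keeping up to two candidates.
   metadata = stream.finishStreamWithMetadata(num_results=2)
   for transcript in metadata.transcripts:
       print(transcript.confidence)
   # `stream` must not be used past this point; the underlying object is freed.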
@@ -233,52 +258,43 @@ class Stream(object):
# This is only for documentation purpose
# Metadata and MetadataItem should be in sync with native_client/deepspeech.h
class MetadataItem(object):
# Metadata, CandidateTranscript and TokenMetadata should be in sync with native_client/deepspeech.h
class TokenMetadata(object):
"""
Stores text of an individual token, along with its timing information
"""
def character(self):
def text(self):
"""
The character generated for transcription
The text for this token
"""
def timestep(self):
"""
Position of the character in units of 20ms
Position of the token in units of 20ms
"""
def start_time(self):
"""
Position of the character in seconds
Position of the token in seconds
"""
class Metadata(object):
class CandidateTranscript(object):
"""
A single transcript computed by the model, including a confidence value and the metadata for its constituent tokens
"""
def items(self):
def tokens(self):
"""
List of items
List of tokens
:return: A list of :func:`MetadataItem` elements
:return: A list of :func:`TokenMetadata` elements
:type: list
"""
def num_items(self):
"""
Size of the list of items
:return: Size of the list of items
:type: int
"""
def confidence(self):
"""
Approximated confidence value for this transcription. This is roughly the
@@ -286,3 +302,12 @@ class Metadata(object):
contributed to the creation of this transcription.
"""
class Metadata(object):
"""
An array of CandidateTranscript objects computed by the model
"""
def transcripts(self):
"""
List of candidate transcripts
:return: A list of :func:`CandidateTranscript` objects
:type: list
"""

View file

@@ -18,6 +18,7 @@ try:
except ImportError:
from pipes import quote
def convert_samplerate(audio_path, desired_sample_rate):
sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate)
try:
@@ -31,25 +32,25 @@ def convert_samplerate(audio_path, desired_sample_rate):
def metadata_to_string(metadata):
return ''.join(item.character for item in metadata.items)
return ''.join(token.text for token in metadata.tokens)
def words_from_metadata(metadata):
def words_from_candidate_transcript(metadata):
word = ""
word_list = []
word_start_time = 0
# Loop through each character
for i in range(0, metadata.num_items):
item = metadata.items[i]
for i, token in enumerate(metadata.tokens):
# Append character to word if it's not a space
if item.character != " ":
if token.text != " ":
if len(word) == 0:
# Log the start time of the new word
word_start_time = item.start_time
word_start_time = token.start_time
word = word + item.character
word = word + token.text
# Word boundary is either a space or the last character in the array
if item.character == " " or i == metadata.num_items - 1:
word_duration = item.start_time - word_start_time
if token.text == " " or i == len(metadata.tokens) - 1:
word_duration = token.start_time - word_start_time
if word_duration < 0:
word_duration = 0
@@ -69,9 +70,11 @@ def words_from_metadata(metadata):
def metadata_json_output(metadata):
json_result = dict()
json_result["words"] = words_from_metadata(metadata)
json_result["confidence"] = metadata.confidence
return json.dumps(json_result)
json_result["transcripts"] = [{
"confidence": transcript.confidence,
"words": words_from_candidate_transcript(transcript),
} for transcript in metadata.transcripts]
return json.dumps(json_result, indent=2)
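
For reference, the emitted JSON has roughly the shape sketched below; the confidence values are invented, and the per-word entries are elided because their exact fields come from words_from_candidate_transcript, which is truncated in this hunk:

.. code-block:: python

   # Hedged sketch of the output shape only -- not real decoder output.
   example_shape = {
       "transcripts": [
           {"confidence": -20.7, "words": ["..."]},
           {"confidence": -21.3, "words": ["..."]},
       ]
   }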
@@ -141,9 +144,9 @@ def main():
print('Running inference.', file=sys.stderr)
inference_start = timer()
if args.extended:
print(metadata_to_string(ds.sttWithMetadata(audio)))
print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
elif args.json:
print(metadata_json_output(ds.sttWithMetadata(audio)))
print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
else:
print(ds.stt(audio))
inference_end = timer() - inference_start

View file

@@ -38,30 +38,69 @@ import_array();
%append_output(SWIG_NewPointerObj(%as_voidptr($1), $1_descriptor, SWIG_POINTER_OWN));
}
%typemap(out) MetadataItem* %{
$result = PyList_New(arg1->num_items);
for (int i = 0; i < arg1->num_items; ++i) {
PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->items[i]), SWIGTYPE_p_MetadataItem, 0);
%fragment("parent_reference_init", "init") {
// Thread-safe initialization - initialize during Python module initialization
parent_reference();
}
%fragment("parent_reference_function", "header", fragment="parent_reference_init") {
static PyObject *parent_reference() {
static PyObject *parent_reference_string = SWIG_Python_str_FromChar("__parent_reference");
return parent_reference_string;
}
}
%typemap(out, fragment="parent_reference_function") CandidateTranscript* %{
$result = PyList_New(arg1->num_transcripts);
for (int i = 0; i < arg1->num_transcripts; ++i) {
PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->transcripts[i]), SWIGTYPE_p_CandidateTranscript, 0);
// Add a reference to Metadata in the returned elements to avoid premature
// garbage collection
PyObject_SetAttr(o, parent_reference(), $self);
PyList_SetItem($result, i, o);
}
%}
%extend struct MetadataItem {
%typemap(out, fragment="parent_reference_function") TokenMetadata* %{
$result = PyList_New(arg1->num_tokens);
for (int i = 0; i < arg1->num_tokens; ++i) {
PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->tokens[i]), SWIGTYPE_p_TokenMetadata, 0);
// Add a reference to CandidateTranscript in the returned elements to avoid premature
// garbage collection
PyObject_SetAttr(o, parent_reference(), $self);
PyList_SetItem($result, i, o);
}
%}
%extend struct TokenMetadata {
%pythoncode %{
def __repr__(self):
return 'MetadataItem(character=\'{}\', timestep={}, start_time={})'.format(self.character, self.timestep, self.start_time)
return 'TokenMetadata(text=\'{}\', timestep={}, start_time={})'.format(self.text, self.timestep, self.start_time)
%}
}
%extend struct CandidateTranscript {
%pythoncode %{
def __repr__(self):
tokens_repr = ',\n'.join(repr(i) for i in self.tokens)
tokens_repr = '\n'.join(' ' + l for l in tokens_repr.split('\n'))
return 'CandidateTranscript(confidence={}, tokens=[\n{}\n])'.format(self.confidence, tokens_repr)
%}
}
%extend struct Metadata {
%pythoncode %{
def __repr__(self):
items_repr = ', \n'.join(' ' + repr(i) for i in self.items)
return 'Metadata(confidence={}, items=[\n{}\n])'.format(self.confidence, items_repr)
transcripts_repr = ',\n'.join(repr(i) for i in self.transcripts)
transcripts_repr = '\n'.join(' ' + l for l in transcripts_repr.split('\n'))
return 'Metadata(transcripts=[\n{}\n])'.format(transcripts_repr)
%}
}
%ignore Metadata::num_items;
%ignore Metadata::num_transcripts;
%ignore CandidateTranscript::num_tokens;
%extend struct Metadata {
~Metadata() {
@@ -69,10 +108,12 @@ import_array();
}
}
%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;
%nodefaultdtor Metadata;
%nodefaultctor CandidateTranscript;
%nodefaultdtor CandidateTranscript;
%nodefaultctor TokenMetadata;
%nodefaultdtor TokenMetadata;
%typemap(newfree) char* "DS_FreeString($1);";
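
The effect of the __parent_reference attribute set in the typemaps above, sketched from the Python side (`model` and `audio` assumed):

.. code-block:: python

   # Hedged sketch: token wrappers point into memory owned by the parent
   # Metadata struct; the hidden reference chain keeps that memory alive
   # until the last child wrapper is garbage collected.
   metadata = model.sttWithMetadata(audio)
   token = metadata.transcripts[0].tokens[0]
   del metadata        # drop the Python wrapper for the parent struct...
   print(token.text)   # ...the token's parent reference keeps the C memory valid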