Cleaned up some things and fetching new en model for testing

This commit is contained in:
Bias 2020-06-26 11:58:16 +02:00 committed by Tilman Kamp
Parent 5cfa401ff1
Commit 3dc0fbb44c
4 changed files with 20 additions and 195 deletions

View file

@ -422,8 +422,8 @@ def main():
if not exists(tlog_path):
if output_graph_path is None:
logging.debug('Looking for model files in "{}"...'.format(model_dir))
output_graph_path = glob(model_dir + "/output_graph.pbmm")[0]
lang_scorer_path = glob(model_dir + "/kenlm.scorer")[0]
output_graph_path = glob(model_dir + "/*.pbmm")[0]
lang_scorer_path = glob(model_dir + "/*.scorer")[0]
kenlm_path = 'dependencies/kenlm/build/bin'
if not path.exists(kenlm_path):
kenlm_path = None

View file

@ -1,18 +1,13 @@
import argparse
import gzip
import io
import os
import subprocess
from collections import Counter
from tqdm import tqdm
def convert_and_filter_topk(output_dir, input_txt, top_k):
""" Convert to lowercase, count word occurrences and save top-k words to a file """
counter = Counter()
#data_lower = os.path.join(output_dir, "lower.txt.gz")
data_lower = output_dir + "." + "lower.txt.gz"
print("\nConverting to lowercase and counting word occurrences ...")
@ -29,7 +24,7 @@ def convert_and_filter_topk(output_dir, input_txt, top_k):
else:
file_in = open(input_txt, encoding="utf-8")
for line in tqdm(file_in):
for line in file_in:
line_lower = line.lower()
counter.update(line_lower.split())
file_out.write(line_lower)
@ -41,7 +36,6 @@ def convert_and_filter_topk(output_dir, input_txt, top_k):
top_counter = counter.most_common(top_k)
vocab_str = "\n".join(word for word, count in top_counter)
vocab_path = "vocab-{}.txt".format(top_k)
#vocab_path = os.path.join(output_dir, vocab_path)
vocab_path = output_dir + "." + vocab_path
with open(vocab_path, "w+", encoding="utf-8") as file:
file.write(vocab_str)
@ -128,86 +122,4 @@ def build_lm(output_dir, kenlm_bins, arpa_order, max_arpa_memory, arpa_prune, di
filtered_path,
binary_path,
]
)
def main():
    """CLI entry point: build the top-k vocab and KenLM binary files,
    then remove the intermediate artifacts left in the output directory.
    """
    parser = argparse.ArgumentParser(
        description="Generate lm.binary and top-k vocab for DeepSpeech."
    )
    # (flag, type, help) for every required option, in help-output order.
    required_options = [
        (
            "--input_txt",
            str,
            "Path to a file.txt or file.txt.gz with sample sentences",
        ),
        ("--output_dir", str, "Directory path for the output"),
        (
            "--top_k",
            int,
            "Use top_k most frequent words for the vocab.txt file. These will be used to filter the ARPA file.",
        ),
        (
            "--kenlm_bins",
            str,
            "File path to the KENLM binaries lmplz, filter and build_binary",
        ),
        ("--arpa_order", int, "Order of k-grams in ARPA-file generation"),
        (
            "--max_arpa_memory",
            str,
            "Maximum allowed memory usage for ARPA-file generation",
        ),
        (
            "--arpa_prune",
            str,
            "ARPA pruning parameters. Separate values with '|'",
        ),
        ("--binary_a_bits", int, "Build binary quantization value a in bits"),
        ("--binary_q_bits", int, "Build binary quantization value q in bits"),
        ("--binary_type", str, "Build binary data structure type"),
    ]
    for flag, arg_type, help_text in required_options:
        parser.add_argument(flag, help=help_text, type=arg_type, required=True)
    # The only optional flag: a plain boolean switch for KenLM's fallback.
    parser.add_argument(
        "--discount_fallback",
        help="To try when such message is returned by kenlm: 'Could not calculate Kneser-Ney discounts [...] rerun with --discount_fallback'",
        action="store_true",
    )
    args = parser.parse_args()

    data_lower, vocab_str = convert_and_filter_topk(args)
    build_lm(args, data_lower, vocab_str)

    # Delete intermediate files produced by the two steps above.
    for intermediate in ("lower.txt.gz", "lm.arpa", "lm_filtered.arpa"):
        os.remove(os.path.join(args.output_dir, intermediate))
if __name__ == "__main__":
main()
)

View file

@ -1,7 +1,3 @@
#!/usr/bin/env python
from __future__ import absolute_import, division, print_function
import argparse
import shutil
import sys
@ -55,6 +51,7 @@ def create_bundle(
scorer.set_utf8_mode(use_utf8)
scorer.reset_params(default_alpha, default_beta)
scorer.load_lm(lm_path)
# TODO: Why is this not working?
#err = scorer.load_lm(lm_path)
#if err != ds_ctcdecoder.DS_ERR_SCORER_NO_TRIE:
# print('Error loading language model file: 0x{:X}.'.format(err))
@ -63,92 +60,4 @@ def create_bundle(
scorer.fill_dictionary(list(words))
shutil.copy(lm_path, package_path)
scorer.save_dictionary(package_path, True) # append, not overwrite
print("Package created in {}".format(package_path))
class Tristate(object):
    """Three-valued logic value: True, False, or None ("unknown").

    Compares by identity of the wrapped value, both against other
    Tristate instances and against the raw singletons, and refuses to
    be coerced to a plain two-valued Boolean.
    """

    def __init__(self, value=None):
        # Identity (not equality) check, so e.g. 1 and 0 are rejected
        # even though they compare equal to True/False.
        for candidate in (True, False, None):
            if value is candidate:
                self.value = value
                break
        else:
            raise ValueError("Tristate value must be True, False, or None")

    def __eq__(self, other):
        if isinstance(other, Tristate):
            return self.value is other.value
        return self.value is other

    def __ne__(self, other):
        return not (self == other)

    def __bool__(self):
        # A three-valued object has no faithful two-valued reading.
        raise TypeError("Tristate object may not be used as a Boolean")

    def __str__(self):
        return str(self.value)

    def __repr__(self):
        return "Tristate(%s)" % self.value
def main():
    """CLI entry point: parse scorer-package options and hand them to
    create_bundle, mapping the --force_utf8 string onto a Tristate.
    """
    parser = argparse.ArgumentParser(
        description="Generate an external scorer package for DeepSpeech."
    )
    parser.add_argument(
        "--alphabet",
        help="Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using UTF-8 mode.",
    )
    parser.add_argument(
        "--lm",
        help="Path of KenLM binary LM file. Must be built without including the vocabulary (use the -v flag). See generate_lm.py for how to create a binary LM.",
        required=True,
    )
    parser.add_argument(
        "--vocab",
        help="Path of vocabulary file. Must contain words separated by whitespace.",
        required=True,
    )
    parser.add_argument(
        "--package", help="Path to save scorer package.", required=True
    )
    parser.add_argument(
        "--default_alpha",
        help="Default value of alpha hyperparameter.",
        type=float,
        required=True,
    )
    parser.add_argument(
        "--default_beta",
        help="Default value of beta hyperparameter.",
        type=float,
        required=True,
    )
    parser.add_argument(
        "--force_utf8",
        help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See <https://github.com/mozilla/DeepSpeech/blob/master/doc/Decoder.rst#utf-8-mode> for further explanation",
        type=str,
        default="",
    )
    args = parser.parse_args()

    # Map the free-form CLI string onto the three-valued flag; anything
    # not recognized as an explicit yes/no means "infer from vocabulary".
    if args.force_utf8 in ("True", "1", "true", "yes", "y"):
        utf8_state = True
    elif args.force_utf8 in ("False", "0", "false", "no", "n"):
        utf8_state = False
    else:
        utf8_state = None
    force_utf8 = Tristate(utf8_state)

    create_bundle(
        args.alphabet,
        args.lm,
        args.vocab,
        args.package,
        force_utf8,
        args.default_alpha,
        args.default_beta,
    )
if __name__ == "__main__":
main()
print("Package created in {}".format(package_path))

View file

@ -1,17 +1,21 @@
#!/usr/bin/env bash
version="0.6.0"
version="0.7.1"
dir="deepspeech-${version}-models"
archive="${dir}.tar.gz"
am="${dir}.pbmm"
scorer="${dir}.scorer"
mkdir -p models
cd models
if [[ ! -f $archive ]] ; then
wget "https://github.com/mozilla/DeepSpeech/releases/download/v${version}/${archive}"
mkdir -p models/en
cd models/en
if [[ ! -f $am ]] ; then
wget "https://github.com/mozilla/DeepSpeech/releases/download/v${version}/${am}"
fi
tar -xzvf $archive
mv $dir en
if [[ ! -f $scorer ]] ; then
wget "https://github.com/mozilla/DeepSpeech/releases/download/v${version}/${scorer}"
fi
wget "https://raw.githubusercontent.com/mozilla/DeepSpeech/master/data/alphabet.txt"
mv alphabet.txt en
if [[ ! -f "alphabet.txt" ]] ; then
wget "https://raw.githubusercontent.com/mozilla/DeepSpeech/master/data/alphabet.txt"
fi