Cleaned up some things and fetching new en model for testing

This commit is contained in:
Bias 2020-06-26 11:58:16 +02:00 committed by Tilman Kamp
Parent 5cfa401ff1
Commit 3dc0fbb44c
4 changed files with 20 additions and 195 deletions

View file

@ -422,8 +422,8 @@ def main():
if not exists(tlog_path):
if output_graph_path is None:
logging.debug('Looking for model files in "{}"...'.format(model_dir))
output_graph_path = glob(model_dir + "/output_graph.pbmm")[0]
lang_scorer_path = glob(model_dir + "/kenlm.scorer")[0]
output_graph_path = glob(model_dir + "/*.pbmm")[0]
lang_scorer_path = glob(model_dir + "/*.scorer")[0]
kenlm_path = 'dependencies/kenlm/build/bin'
if not path.exists(kenlm_path):
kenlm_path = None

View file

@ -1,18 +1,13 @@
import argparse
import gzip
import io
import os
import subprocess
from collections import Counter
from tqdm import tqdm
def convert_and_filter_topk(output_dir, input_txt, top_k):
""" Convert to lowercase, count word occurrences and save top-k words to a file """
counter = Counter()
#data_lower = os.path.join(output_dir, "lower.txt.gz")
data_lower = output_dir + "." + "lower.txt.gz"
print("\nConverting to lowercase and counting word occurrences ...")
@ -29,7 +24,7 @@ def convert_and_filter_topk(output_dir, input_txt, top_k):
else:
file_in = open(input_txt, encoding="utf-8")
for line in tqdm(file_in):
for line in file_in:
line_lower = line.lower()
counter.update(line_lower.split())
file_out.write(line_lower)
@ -41,7 +36,6 @@ def convert_and_filter_topk(output_dir, input_txt, top_k):
top_counter = counter.most_common(top_k)
vocab_str = "\n".join(word for word, count in top_counter)
vocab_path = "vocab-{}.txt".format(top_k)
#vocab_path = os.path.join(output_dir, vocab_path)
vocab_path = output_dir + "." + vocab_path
with open(vocab_path, "w+", encoding="utf-8") as file:
file.write(vocab_str)
@ -128,86 +122,4 @@ def build_lm(output_dir, kenlm_bins, arpa_order, max_arpa_memory, arpa_prune, di
filtered_path,
binary_path,
]
)
def main():
    """CLI entry point: build the top-k vocab and KenLM binary files,
    then remove the intermediate artifacts left in the output directory.
    """
    parser = argparse.ArgumentParser(
        description="Generate lm.binary and top-k vocab for DeepSpeech."
    )
    # (flag, type, help) for every required option, in help-output order.
    required_options = [
        (
            "--input_txt",
            str,
            "Path to a file.txt or file.txt.gz with sample sentences",
        ),
        ("--output_dir", str, "Directory path for the output"),
        (
            "--top_k",
            int,
            "Use top_k most frequent words for the vocab.txt file. These will be used to filter the ARPA file.",
        ),
        (
            "--kenlm_bins",
            str,
            "File path to the KENLM binaries lmplz, filter and build_binary",
        ),
        ("--arpa_order", int, "Order of k-grams in ARPA-file generation"),
        (
            "--max_arpa_memory",
            str,
            "Maximum allowed memory usage for ARPA-file generation",
        ),
        (
            "--arpa_prune",
            str,
            "ARPA pruning parameters. Separate values with '|'",
        ),
        ("--binary_a_bits", int, "Build binary quantization value a in bits"),
        ("--binary_q_bits", int, "Build binary quantization value q in bits"),
        ("--binary_type", str, "Build binary data structure type"),
    ]
    for flag, arg_type, help_text in required_options:
        parser.add_argument(flag, help=help_text, type=arg_type, required=True)
    # The only optional flag: a plain boolean switch for KenLM's fallback.
    parser.add_argument(
        "--discount_fallback",
        help="To try when such message is returned by kenlm: 'Could not calculate Kneser-Ney discounts [...] rerun with --discount_fallback'",
        action="store_true",
    )
    args = parser.parse_args()

    data_lower, vocab_str = convert_and_filter_topk(args)
    build_lm(args, data_lower, vocab_str)

    # Delete intermediate files produced by the two steps above.
    for intermediate in ("lower.txt.gz", "lm.arpa", "lm_filtered.arpa"):
        os.remove(os.path.join(args.output_dir, intermediate))
if __name__ == "__main__":
main()
)

View file

@ -1,7 +1,3 @@
#!/usr/bin/env python
from __future__ import absolute_import, division, print_function
import argparse
import shutil
import sys
@ -55,6 +51,7 @@ def create_bundle(
scorer.set_utf8_mode(use_utf8)
scorer.reset_params(default_alpha, default_beta)
scorer.load_lm(lm_path)
# TODO: Why is this not working?
#err = scorer.load_lm(lm_path)
#if err != ds_ctcdecoder.DS_ERR_SCORER_NO_TRIE:
# print('Error loading language model file: 0x{:X}.'.format(err))
@ -63,92 +60,4 @@ def create_bundle(
scorer.fill_dictionary(list(words))
shutil.copy(lm_path, package_path)
scorer.save_dictionary(package_path, True) # append, not overwrite
print("Package created in {}".format(package_path))
class Tristate(object):
    """Three-valued logic value: True, False, or None ("unknown").

    Compares by identity of the wrapped value, both against other
    Tristate instances and against the raw singletons, and refuses to
    be coerced to a plain two-valued Boolean.
    """

    def __init__(self, value=None):
        # Identity (not equality) check, so e.g. 1 and 0 are rejected
        # even though they compare equal to True/False.
        for candidate in (True, False, None):
            if value is candidate:
                self.value = value
                break
        else:
            raise ValueError("Tristate value must be True, False, or None")

    def __eq__(self, other):
        if isinstance(other, Tristate):
            return self.value is other.value
        return self.value is other

    def __ne__(self, other):
        return not (self == other)

    def __bool__(self):
        # A three-valued object has no faithful two-valued reading.
        raise TypeError("Tristate object may not be used as a Boolean")

    def __str__(self):
        return str(self.value)

    def __repr__(self):
        return "Tristate(%s)" % self.value
def main():
    """CLI entry point: parse scorer-package options and hand them to
    create_bundle, mapping the --force_utf8 string onto a Tristate.
    """
    parser = argparse.ArgumentParser(
        description="Generate an external scorer package for DeepSpeech."
    )
    parser.add_argument(
        "--alphabet",
        help="Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using UTF-8 mode.",
    )
    parser.add_argument(
        "--lm",
        help="Path of KenLM binary LM file. Must be built without including the vocabulary (use the -v flag). See generate_lm.py for how to create a binary LM.",
        required=True,
    )
    parser.add_argument(
        "--vocab",
        help="Path of vocabulary file. Must contain words separated by whitespace.",
        required=True,
    )
    parser.add_argument(
        "--package", help="Path to save scorer package.", required=True
    )
    parser.add_argument(
        "--default_alpha",
        help="Default value of alpha hyperparameter.",
        type=float,
        required=True,
    )
    parser.add_argument(
        "--default_beta",
        help="Default value of beta hyperparameter.",
        type=float,
        required=True,
    )
    parser.add_argument(
        "--force_utf8",
        help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See <https://github.com/mozilla/DeepSpeech/blob/master/doc/Decoder.rst#utf-8-mode> for further explanation",
        type=str,
        default="",
    )
    args = parser.parse_args()

    # Map the free-form CLI string onto the three-valued flag; anything
    # not recognized as an explicit yes/no means "infer from vocabulary".
    if args.force_utf8 in ("True", "1", "true", "yes", "y"):
        utf8_state = True
    elif args.force_utf8 in ("False", "0", "false", "no", "n"):
        utf8_state = False
    else:
        utf8_state = None
    force_utf8 = Tristate(utf8_state)

    create_bundle(
        args.alphabet,
        args.lm,
        args.vocab,
        args.package,
        force_utf8,
        args.default_alpha,
        args.default_beta,
    )
if __name__ == "__main__":
main()
print("Package created in {}".format(package_path))

View file

@ -1,17 +1,21 @@
#!/usr/bin/env bash
version="0.6.0"
version="0.7.1"
dir="deepspeech-${version}-models"
archive="${dir}.tar.gz"
am="${dir}.pbmm"
scorer="${dir}.scorer"
mkdir -p models
cd models
if [[ ! -f $archive ]] ; then
wget "https://github.com/mozilla/DeepSpeech/releases/download/v${version}/${archive}"
mkdir -p models/en
cd models/en
if [[ ! -f $am ]] ; then
wget "https://github.com/mozilla/DeepSpeech/releases/download/v${version}/${am}"
fi
tar -xzvf $archive
mv $dir en
if [[ ! -f $scorer ]] ; then
wget "https://github.com/mozilla/DeepSpeech/releases/download/v${version}/${scorer}"
fi
wget "https://raw.githubusercontent.com/mozilla/DeepSpeech/master/data/alphabet.txt"
mv alphabet.txt en
if [[ ! -f "alphabet.txt" ]] ; then
wget "https://raw.githubusercontent.com/mozilla/DeepSpeech/master/data/alphabet.txt"
fi