зеркало из https://github.com/mozilla/DSAlign.git
Cleaned up some things and fetching new en model for testing
This commit is contained in:
Родитель
5cfa401ff1
Коммит
3dc0fbb44c
|
@ -422,8 +422,8 @@ def main():
|
|||
if not exists(tlog_path):
|
||||
if output_graph_path is None:
|
||||
logging.debug('Looking for model files in "{}"...'.format(model_dir))
|
||||
output_graph_path = glob(model_dir + "/output_graph.pbmm")[0]
|
||||
lang_scorer_path = glob(model_dir + "/kenlm.scorer")[0]
|
||||
output_graph_path = glob(model_dir + "/*.pbmm")[0]
|
||||
lang_scorer_path = glob(model_dir + "/*.scorer")[0]
|
||||
kenlm_path = 'dependencies/kenlm/build/bin'
|
||||
if not path.exists(kenlm_path):
|
||||
kenlm_path = None
|
||||
|
|
|
@ -1,18 +1,13 @@
|
|||
import argparse
|
||||
import gzip
|
||||
import io
|
||||
import os
|
||||
import subprocess
|
||||
from collections import Counter
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def convert_and_filter_topk(output_dir, input_txt, top_k):
|
||||
""" Convert to lowercase, count word occurrences and save top-k words to a file """
|
||||
|
||||
counter = Counter()
|
||||
#data_lower = os.path.join(output_dir, "lower.txt.gz")
|
||||
data_lower = output_dir + "." + "lower.txt.gz"
|
||||
|
||||
print("\nConverting to lowercase and counting word occurrences ...")
|
||||
|
@ -29,7 +24,7 @@ def convert_and_filter_topk(output_dir, input_txt, top_k):
|
|||
else:
|
||||
file_in = open(input_txt, encoding="utf-8")
|
||||
|
||||
for line in tqdm(file_in):
|
||||
for line in file_in:
|
||||
line_lower = line.lower()
|
||||
counter.update(line_lower.split())
|
||||
file_out.write(line_lower)
|
||||
|
@ -41,7 +36,6 @@ def convert_and_filter_topk(output_dir, input_txt, top_k):
|
|||
top_counter = counter.most_common(top_k)
|
||||
vocab_str = "\n".join(word for word, count in top_counter)
|
||||
vocab_path = "vocab-{}.txt".format(top_k)
|
||||
#vocab_path = os.path.join(output_dir, vocab_path)
|
||||
vocab_path = output_dir + "." + vocab_path
|
||||
with open(vocab_path, "w+", encoding="utf-8") as file:
|
||||
file.write(vocab_str)
|
||||
|
@ -128,86 +122,4 @@ def build_lm(output_dir, kenlm_bins, arpa_order, max_arpa_memory, arpa_prune, di
|
|||
filtered_path,
|
||||
binary_path,
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Generate lm.binary and top-k vocab for DeepSpeech."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input_txt",
|
||||
help="Path to a file.txt or file.txt.gz with sample sentences",
|
||||
type=str,
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir", help="Directory path for the output", type=str, required=True
|
||||
)
|
||||
parser.add_argument(
|
||||
"--top_k",
|
||||
help="Use top_k most frequent words for the vocab.txt file. These will be used to filter the ARPA file.",
|
||||
type=int,
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--kenlm_bins",
|
||||
help="File path to the KENLM binaries lmplz, filter and build_binary",
|
||||
type=str,
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--arpa_order",
|
||||
help="Order of k-grams in ARPA-file generation",
|
||||
type=int,
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_arpa_memory",
|
||||
help="Maximum allowed memory usage for ARPA-file generation",
|
||||
type=str,
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--arpa_prune",
|
||||
help="ARPA pruning parameters. Separate values with '|'",
|
||||
type=str,
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--binary_a_bits",
|
||||
help="Build binary quantization value a in bits",
|
||||
type=int,
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--binary_q_bits",
|
||||
help="Build binary quantization value q in bits",
|
||||
type=int,
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--binary_type",
|
||||
help="Build binary data structure type",
|
||||
type=str,
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--discount_fallback",
|
||||
help="To try when such message is returned by kenlm: 'Could not calculate Kneser-Ney discounts [...] rerun with --discount_fallback'",
|
||||
action="store_true",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
data_lower, vocab_str = convert_and_filter_topk(args)
|
||||
build_lm(args, data_lower, vocab_str)
|
||||
|
||||
# Delete intermediate files
|
||||
os.remove(os.path.join(args.output_dir, "lower.txt.gz"))
|
||||
os.remove(os.path.join(args.output_dir, "lm.arpa"))
|
||||
os.remove(os.path.join(args.output_dir, "lm_filtered.arpa"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
)
|
|
@ -1,7 +1,3 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
import sys
|
||||
|
||||
|
@ -55,6 +51,7 @@ def create_bundle(
|
|||
scorer.set_utf8_mode(use_utf8)
|
||||
scorer.reset_params(default_alpha, default_beta)
|
||||
scorer.load_lm(lm_path)
|
||||
# TODO: Why is this not working?
|
||||
#err = scorer.load_lm(lm_path)
|
||||
#if err != ds_ctcdecoder.DS_ERR_SCORER_NO_TRIE:
|
||||
# print('Error loading language model file: 0x{:X}.'.format(err))
|
||||
|
@ -63,92 +60,4 @@ def create_bundle(
|
|||
scorer.fill_dictionary(list(words))
|
||||
shutil.copy(lm_path, package_path)
|
||||
scorer.save_dictionary(package_path, True) # append, not overwrite
|
||||
print("Package created in {}".format(package_path))
|
||||
|
||||
|
||||
class Tristate(object):
|
||||
def __init__(self, value=None):
|
||||
if any(value is v for v in (True, False, None)):
|
||||
self.value = value
|
||||
else:
|
||||
raise ValueError("Tristate value must be True, False, or None")
|
||||
|
||||
def __eq__(self, other):
|
||||
return (
|
||||
self.value is other.value
|
||||
if isinstance(other, Tristate)
|
||||
else self.value is other
|
||||
)
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self == other
|
||||
|
||||
def __bool__(self):
|
||||
raise TypeError("Tristate object may not be used as a Boolean")
|
||||
|
||||
def __str__(self):
|
||||
return str(self.value)
|
||||
|
||||
def __repr__(self):
|
||||
return "Tristate(%s)" % self.value
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Generate an external scorer package for DeepSpeech."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--alphabet",
|
||||
help="Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using UTF-8 mode.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lm",
|
||||
required=True,
|
||||
help="Path of KenLM binary LM file. Must be built without including the vocabulary (use the -v flag). See generate_lm.py for how to create a binary LM.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--vocab",
|
||||
required=True,
|
||||
help="Path of vocabulary file. Must contain words separated by whitespace.",
|
||||
)
|
||||
parser.add_argument("--package", required=True, help="Path to save scorer package.")
|
||||
parser.add_argument(
|
||||
"--default_alpha",
|
||||
type=float,
|
||||
required=True,
|
||||
help="Default value of alpha hyperparameter.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--default_beta",
|
||||
type=float,
|
||||
required=True,
|
||||
help="Default value of beta hyperparameter.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force_utf8",
|
||||
type=str,
|
||||
default="",
|
||||
help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See <https://github.com/mozilla/DeepSpeech/blob/master/doc/Decoder.rst#utf-8-mode> for further explanation",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.force_utf8 in ("True", "1", "true", "yes", "y"):
|
||||
force_utf8 = Tristate(True)
|
||||
elif args.force_utf8 in ("False", "0", "false", "no", "n"):
|
||||
force_utf8 = Tristate(False)
|
||||
else:
|
||||
force_utf8 = Tristate(None)
|
||||
|
||||
create_bundle(
|
||||
args.alphabet,
|
||||
args.lm,
|
||||
args.vocab,
|
||||
args.package,
|
||||
force_utf8,
|
||||
args.default_alpha,
|
||||
args.default_beta,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
print("Package created in {}".format(package_path))
|
|
@ -1,17 +1,21 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
version="0.6.0"
|
||||
version="0.7.1"
|
||||
dir="deepspeech-${version}-models"
|
||||
archive="${dir}.tar.gz"
|
||||
am="${dir}.pbmm"
|
||||
scorer="${dir}.scorer"
|
||||
|
||||
mkdir -p models
|
||||
cd models
|
||||
if [[ ! -f $archive ]] ; then
|
||||
wget "https://github.com/mozilla/DeepSpeech/releases/download/v${version}/${archive}"
|
||||
mkdir -p models/en
|
||||
cd models/en
|
||||
|
||||
if [[ ! -f $am ]] ; then
|
||||
wget "https://github.com/mozilla/DeepSpeech/releases/download/v${version}/${am}"
|
||||
fi
|
||||
|
||||
tar -xzvf $archive
|
||||
mv $dir en
|
||||
if [[ ! -f $scorer ]] ; then
|
||||
wget "https://github.com/mozilla/DeepSpeech/releases/download/v${version}/${scorer}"
|
||||
fi
|
||||
|
||||
wget "https://raw.githubusercontent.com/mozilla/DeepSpeech/master/data/alphabet.txt"
|
||||
mv alphabet.txt en
|
||||
if [[ ! -f "alphabet.txt" ]] ; then
|
||||
wget "https://raw.githubusercontent.com/mozilla/DeepSpeech/master/data/alphabet.txt"
|
||||
fi
|
||||
|
|
Загрузка…
Ссылка в новой задаче