diff --git a/.compute b/.compute new file mode 100644 index 0000000..96e449c --- /dev/null +++ b/.compute @@ -0,0 +1,11 @@ +#!/bin/bash + +set -xe + +apt-get install -y python3-venv cmake libboost-all-dev libeigen3-dev + +export SW_DIR="/root" +export MODELS_DIR="${ML_GROUP_DIR}/language-models" +mkdir -p "${MODELS_DIR}" + +bin/genlm --alphabet-mode utf8 en diff --git a/.idea/vcs.xml b/.idea/vcs.xml index 94a25f7..3e178b4 100644 --- a/.idea/vcs.xml +++ b/.idea/vcs.xml @@ -2,5 +2,6 @@ + \ No newline at end of file diff --git a/bin/prepare b/bin/prepare index ef8101f..b92a596 100755 --- a/bin/prepare +++ b/bin/prepare @@ -1,5 +1,7 @@ #!/usr/bin/env bash +SW_DIR="${SW_DIR:-dependencies}" + if [ ! -d venv ]; then python3 -m venv venv source venv/bin/activate @@ -8,9 +10,9 @@ if [ ! -d venv ]; then printf "\n ****** Installed Python packages ****** \n\n\n\n" fi -if [ ! -f dependencies/kenlm/build/bin/lmplz ]; then - mkdir -p dependencies/kenlm - pushd dependencies +if [ ! -f "${SW_DIR}/kenlm/build/bin/lmplz" ]; then + mkdir -p "${SW_DIR}/kenlm" + pushd "${SW_DIR}" git clone https://github.com/kpu/kenlm.git pushd kenlm @@ -27,10 +29,10 @@ if [ ! -f dependencies/kenlm/build/bin/lmplz ]; then printf "\n ****** Installed KenLM ****** \n\n\n\n" fi -if [ ! -f dependencies/deepspeech/libdeepspeech.so ]; then +if [ ! -f "${SW_DIR}/deepspeech/libdeepspeech.so" ]; then source venv/bin/activate - mkdir -p dependencies/deepspeech - python oscarlm/taskcluster.py --target dependencies/deepspeech --branch v0.6.0 + mkdir -p "${SW_DIR}/deepspeech" + python oscarlm/taskcluster.py --target "${SW_DIR}/deepspeech" --branch v0.6.0 printf "\n ****** Installed DeepSpeech tools ****** \n\n\n\n" fi diff --git a/oscarlm/genlm.py b/oscarlm/genlm.py index 7c437d0..7bd9124 100644 --- a/oscarlm/genlm.py +++ b/oscarlm/genlm.py @@ -16,8 +16,9 @@ from utils import maybe_download, maybe_ungzip, maybe_join, section, log_progres STOP_TOKEN = False MAX_KEYS = 100000 -KENLM_BIN = 'dependencies/kenlm/build/bin' -DEEPSPEECH_BIN = 'dependencies/deepspeech' +SW_DIR = os.getenv('SW_DIR', 'dependencies') +KENLM_BIN = SW_DIR + '/kenlm/build/bin' +DEEPSPEECH_BIN = SW_DIR + '/deepspeech' def get_partial_path(index): @@ -226,8 +227,6 @@ def parse_args(): help='language of the model to generate') parser.add_argument('--workers', type=int, default=os.cpu_count(), help='number of preparation and counting workers') - parser.add_argument('--simulate', action='store_true', - help='simulate language model generation with small amount of input data') parser.add_argument('--prune-factor', type=int, default=10, help='times --vocabulary-size of items to keep in each vocabulary aggregator') parser.add_argument('--vocabulary-size', type=int, default=500000, diff --git a/oscarlm/languages/__init__.py b/oscarlm/languages/__init__.py index 28cb134..9a2816a 100644 --- a/oscarlm/languages/__init__.py +++ b/oscarlm/languages/__init__.py @@ -13,7 +13,7 @@ def code_from_filename(filename): FILE_DIR = os.path.dirname(__file__) LANGUAGE_CODES = list(map(code_from_filename, glob(FILE_DIR + '/[!_]*.py'))) BASE_DIR = os.path.dirname(os.path.dirname(FILE_DIR)) -MODELS_DIR = os.path.join(BASE_DIR, 'models') +MODELS_DIR = os.getenv('MODELS_DIR', os.path.join(BASE_DIR, 'models')) class LanguageBase: