Configurable directories for dependencies and models; .compute file

2020-03-13 19:27:50 +01:00 · 2020-03-13 19:27:50 +01:00 · a2229e9c8b
--- a/.compute
+++ b/.compute
@ -0,0 +1,11 @@
+#!/bin/bash
+
+set -xe  # trace each command and abort on the first failure
+
+apt-get install -y python3-venv cmake libboost-all-dev libeigen3-dev
+
+export SW_DIR="/root"  # root for kenlm/ and deepspeech/ (read by bin/prepare and oscarlm/genlm.py)
+export MODELS_DIR="${ML_GROUP_DIR:?ML_GROUP_DIR must be set}/language-models"  # fail fast instead of using /language-models
+mkdir -p "${MODELS_DIR}"
+
+bin/genlm --alphabet-mode utf8 en
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -2,5 +2,6 @@
 <project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/test/kenlm" vcs="Git" />
  </component>
 </project>
--- a/bin/prepare
+++ b/bin/prepare
@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+SW_DIR="${SW_DIR:-dependencies}"  # install root for kenlm/ and deepspeech/; falls back to ./dependencies when unset or empty
+
 if [ ! -d venv ]; then
  python3 -m venv venv
  source venv/bin/activate
@ -8,9 +10,9 @@ if [ ! -d venv ]; then
  printf "\n ****** Installed Python packages ****** \n\n\n\n"
 fi

-if [ ! -f dependencies/kenlm/build/bin/lmplz ]; then
-  mkdir -p dependencies/kenlm
-  pushd dependencies
+if [ ! -f "${SW_DIR}/kenlm/build/bin/lmplz" ]; then
+  mkdir -p "${SW_DIR}/kenlm"
+  pushd "${SW_DIR}"

  git clone https://github.com/kpu/kenlm.git
  pushd kenlm
@ -27,10 +29,10 @@ if [ ! -f dependencies/kenlm/build/bin/lmplz ]; then
  printf "\n ****** Installed KenLM ****** \n\n\n\n"
 fi

-if [ ! -f dependencies/deepspeech/libdeepspeech.so ]; then
+if [ ! -f "${SW_DIR}/deepspeech/libdeepspeech.so" ]; then  # skip this step when libdeepspeech.so is already present
  source venv/bin/activate
-  mkdir -p dependencies/deepspeech
-  python oscarlm/taskcluster.py --target dependencies/deepspeech --branch v0.6.0
+  mkdir -p "${SW_DIR}/deepspeech"
+  python oscarlm/taskcluster.py --target "${SW_DIR}/deepspeech" --branch v0.6.0
  printf "\n ****** Installed DeepSpeech tools ****** \n\n\n\n"
 fi

--- a/oscarlm/genlm.py
+++ b/oscarlm/genlm.py
@ -16,8 +16,9 @@ from utils import maybe_download, maybe_ungzip, maybe_join, section, log_progres
 STOP_TOKEN = False
 MAX_KEYS = 100000

-KENLM_BIN = 'dependencies/kenlm/build/bin'
-DEEPSPEECH_BIN = 'dependencies/deepspeech'
+SW_DIR = os.getenv('SW_DIR') or 'dependencies'  # empty counts as unset, matching ${SW_DIR:-dependencies} in bin/prepare
+KENLM_BIN = os.path.join(SW_DIR, 'kenlm', 'build', 'bin')  # directory holding KenLM binaries such as lmplz
+DEEPSPEECH_BIN = os.path.join(SW_DIR, 'deepspeech')  # directory bin/prepare passes as taskcluster.py --target


 def get_partial_path(index):
@ -226,8 +227,6 @@ def parse_args():
                        help='language of the model to generate')
    parser.add_argument('--workers', type=int, default=os.cpu_count(),
                        help='number of preparation and counting workers')
-    parser.add_argument('--simulate', action='store_true',
-                        help='simulate language model generation with small amount of input data')
    parser.add_argument('--prune-factor', type=int, default=10,
                        help='times --vocabulary-size of items to keep in each vocabulary aggregator')
    parser.add_argument('--vocabulary-size', type=int, default=500000,
--- a/oscarlm/languages/__init__.py
+++ b/oscarlm/languages/__init__.py
@ -13,7 +13,7 @@ def code_from_filename(filename):
 FILE_DIR = os.path.dirname(__file__)
 LANGUAGE_CODES = list(map(code_from_filename, glob(FILE_DIR + '/[!_]*.py')))
 BASE_DIR = os.path.dirname(os.path.dirname(FILE_DIR))
-MODELS_DIR = os.path.join(BASE_DIR, 'models')
+MODELS_DIR = os.getenv('MODELS_DIR') or os.path.join(BASE_DIR, 'models')  # env override; unset or empty falls back to <repo>/models


 class LanguageBase: