# DeepSpeech/Dockerfile.train.tmpl

# Please refer to the TRAINING documentation, "Basic Dockerfile for training"

# Base image: TensorFlow 1.15.2 with GPU support on Python 3
FROM tensorflow/tensorflow:1.15.2-gpu-py3

# Keep apt/dpkg from asking interactive questions during package installs
ENV DEBIAN_FRONTEND=noninteractive

# Repository and revision to build. The values are template placeholders,
# presumably substituted when this template is rendered (TODO confirm).
ENV MOZILLA_VOICE_STT_REPO=#MOZILLA_VOICE_STT_REPO# \
    MOZILLA_VOICE_STT_SHA=#MOZILLA_VOICE_STT_SHA#

# Install build tools and runtime libraries in a single layer:
#  - "apt-get update" shares the layer with every install, so a cached, stale
#    package index is never reused by a later install step
#  - libopus0 and libsndfile1 are dependencies for audio augmentation
#  - python3-xdg is purged because it breaks the deepspeech install later
#    with weird errors about setuptools
#  - the apt lists are removed in this same layer; deleting them in a later
#    layer would not reduce the image size
RUN apt-get update && apt-get install -y --no-install-recommends \
        apt-utils \
        bash-completion \
        build-essential \
        cmake \
        curl \
        git \
        libboost-all-dev \
        libbz2-dev \
        libopus0 \
        libsndfile1 \
        locales \
        python3-venv \
        unzip \
        wget \
    && apt-get purge -y python3-xdg \
    && rm -rf /var/lib/apt/lists/*

# Clone into an explicit, fixed path so the following WORKDIR steps do not
# depend on the directory name git would derive from the repository URL
# (a fork or renamed repository would otherwise not land in /DeepSpeech).
RUN git clone "$MOZILLA_VOICE_STT_REPO" /DeepSpeech

# Pin the working tree to the requested revision
WORKDIR /DeepSpeech
RUN git checkout "$MOZILLA_VOICE_STT_SHA"

# Build CTC decoder first, to avoid clashes on incompatible versions upgrades
WORKDIR /DeepSpeech/native_client/ctcdecode
RUN make NUM_PROCESSES="$(nproc)" bindings

# Install the wheel produced by the build above; --no-cache-dir keeps pip's
# download cache out of the image layer
WORKDIR /DeepSpeech
RUN pip3 install --no-cache-dir --upgrade native_client/ctcdecode/dist/*.whl

# Prepare deps: pin the Python packaging tools to known versions.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir --upgrade \
        pip==20.0.2 \
        setuptools==46.1.3 \
        wheel==0.34.2

# Install DeepSpeech (editable install from the current checkout)
#  - No need for the decoder since we did it earlier
#  - There is already correct TensorFlow GPU installed on the base image,
#    we don't want to break that
#  - --no-cache-dir keeps pip's download cache out of the image layer
RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --no-cache-dir --upgrade -e .

# Tool to convert output graph for inference; the artifact is requested from
# the tensorflow r1.15 branch and targeted at the current directory
RUN python3 util/taskcluster.py \
        --source tensorflow \
        --branch r1.15 \
        --artifact convert_graphdef_memmapped_format \
        --target .

# Build KenLM to generate new scorers.
# Any existing kenlm directory is replaced by a fresh clone, which is then
# checked out at a fixed commit and built out-of-tree in kenlm/build.
WORKDIR /DeepSpeech/native_client
RUN rm -rf kenlm \
    && git clone https://github.com/kpu/kenlm \
    && cd kenlm \
    && git checkout 87e85e66c99ceff1fab2500a7c60c01da7315eec \
    && mkdir -p build \
    && cd build \
    && cmake .. \
    && make -j "$(nproc)"

# Return to the repository root for the remaining steps
WORKDIR /DeepSpeech

# Execute bin/run-ldc93s1.sh as part of the image build.
# NOTE(review): presumably a short training run used as a smoke test of the
# installed toolchain — confirm against the script itself.
RUN ./bin/run-ldc93s1.sh
Decouple Dockerfile into build and train 2020-06-02 22:23:20 +03:00			`# Please refer to the TRAINING documentation, "Basic Dockerfile for training"`

			`FROM tensorflow/tensorflow:1.15.2-gpu-py3`
Fix usage of ARG instead of ENV in Dockerfile.train 2020-06-18 16:20:46 +03:00			`ENV DEBIAN_FRONTEND=noninteractive`
Decouple Dockerfile into build and train 2020-06-02 22:23:20 +03:00
Use new name for Docker container and Docker Hub repo 2020-08-10 21:19:50 +03:00			`ENV MOZILLA_VOICE_STT_REPO=#MOZILLA_VOICE_STT_REPO#`
			`ENV MOZILLA_VOICE_STT_SHA=#MOZILLA_VOICE_STT_SHA#`
Decouple Dockerfile into build and train 2020-06-02 22:23:20 +03:00
			`RUN apt-get update && apt-get install -y --no-install-recommends \`
			`apt-utils \`
			`bash-completion \`
			`build-essential \`
Build kenlm in training container image. 2020-06-26 16:07:18 +03:00			`cmake \`
Decouple Dockerfile into build and train 2020-06-02 22:23:20 +03:00			`curl \`
			`git \`
Build kenlm in training container image. 2020-06-26 16:07:18 +03:00			`libboost-all-dev \`
Decouple Dockerfile into build and train 2020-06-02 22:23:20 +03:00			`libbz2-dev \`
			`locales \`
			`python3-venv \`
			`unzip \`
			`wget`

Fix #3071: Don't reinstall TensorFlow on top of TensorFlow 2020-06-16 23:14:24 +03:00			`# We need to remove it because it's breaking deepspeech install later with`
			`# weird errors about setuptools`
			`RUN apt-get purge -y python3-xdg`

Add dependencies for new audio augmentation flags. Fixes #3082. 2020-06-18 13:24:56 +03:00			`# Install dependencies for audio augmentation`
			`RUN apt-get install -y --no-install-recommends libopus0 libsndfile1`

Fix #3157: Add CircleCI config 2020-07-15 14:46:13 +03:00			`# Try and free some space`
			`RUN rm -rf /var/lib/apt/lists/*`

Decouple Dockerfile into build and train 2020-06-02 22:23:20 +03:00			`WORKDIR /`
Use new name for Docker container and Docker Hub repo 2020-08-10 21:19:50 +03:00			`RUN git clone $MOZILLA_VOICE_STT_REPO`
Decouple Dockerfile into build and train 2020-06-02 22:23:20 +03:00
Revert "Merge pull request #3246 from lissyx/fix-docker" This reverts commit c01fda56c058779cc9dba952ce940c47398c4ed3, reversing changes made to 3e99b0d8b2b2d6e47c8ff7eb1dfd9a88eba8e6d8. 2020-08-25 16:35:03 +03:00			`WORKDIR /DeepSpeech`
Use new name for Docker container and Docker Hub repo 2020-08-10 21:19:50 +03:00			`RUN git checkout $MOZILLA_VOICE_STT_SHA`
Decouple Dockerfile into build and train 2020-06-02 22:23:20 +03:00
			`# Build CTC decoder first, to avoid clashes on incompatible versions upgrades`
			`RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings`
			`RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl`

			`# Prepare deps`
			`RUN pip3 install --upgrade pip==20.0.2 wheel==0.34.2 setuptools==46.1.3`

Revert "Merge pull request #3246 from lissyx/fix-docker" This reverts commit c01fda56c058779cc9dba952ce940c47398c4ed3, reversing changes made to 3e99b0d8b2b2d6e47c8ff7eb1dfd9a88eba8e6d8. 2020-08-25 16:35:03 +03:00			`# Install DeepSpeech`
Fix #3071: Don't reinstall TensorFlow on top of TensorFlow 2020-06-16 23:14:24 +03:00			`# - No need for the decoder since we did it earlier`
			`# - There is already correct TensorFlow GPU installed on the base image,`
			`# we don't want to break that`
Remove --force-reinstall from training code install No longer needed since we started publishing ds_ctcdecode on PyPI. 2020-06-17 16:26:31 +03:00			`RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e .`
Decouple Dockerfile into build and train 2020-06-02 22:23:20 +03:00
Install checkpoint converting tool. 2020-06-18 16:20:28 +03:00			`# Tool to convert output graph for inference`
Make paths relative. 2020-06-18 18:26:38 +03:00			`RUN python3 util/taskcluster.py --source tensorflow --branch r1.15 \`
			`--artifact convert_graphdef_memmapped_format --target .`
Install checkpoint converting tool. 2020-06-18 16:20:28 +03:00
Build kenlm in training container image. 2020-06-26 16:07:18 +03:00			`# Build KenLM to generate new scorers`
Revert "Merge pull request #3246 from lissyx/fix-docker" This reverts commit c01fda56c058779cc9dba952ce940c47398c4ed3, reversing changes made to 3e99b0d8b2b2d6e47c8ff7eb1dfd9a88eba8e6d8. 2020-08-25 16:35:03 +03:00			`WORKDIR /DeepSpeech/native_client`
Build kenlm in training container image. 2020-06-26 16:07:18 +03:00			`RUN rm -rf kenlm && \`
			`git clone https://github.com/kpu/kenlm && \`
			`cd kenlm && \`
			`git checkout 87e85e66c99ceff1fab2500a7c60c01da7315eec && \`
			`mkdir -p build && \`
			`cd build && \`
			`cmake .. && \`
			`make -j $(nproc)`
Revert "Merge pull request #3246 from lissyx/fix-docker" This reverts commit c01fda56c058779cc9dba952ce940c47398c4ed3, reversing changes made to 3e99b0d8b2b2d6e47c8ff7eb1dfd9a88eba8e6d8. 2020-08-25 16:35:03 +03:00			`WORKDIR /DeepSpeech`
Build kenlm in training container image. 2020-06-26 16:07:18 +03:00
Decouple Dockerfile into build and train 2020-06-02 22:23:20 +03:00			`RUN ./bin/run-ldc93s1.sh`