version: '3'

tasks:
  poetry-install-*:
    internal: true
    desc: Install only the dependency group needed for a task.
    vars:
      GROUP: '{{index .MATCH 0}}'
    cmds:
      - poetry install --only {{.GROUP}} --no-root
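  # Note on `poetry-install-*` (above): this uses Task's wildcard matching, and the text
  # captured by `*` is exposed as `{{index .MATCH 0}}`. A dependency such as
  # `deps: [poetry-install-utils]` therefore runs `poetry install --only utils --no-root`.
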
  clean-venvs:
    desc: Remove the virtual envs created by the test runner.
    cmds:
      - rm -rf data/task-venvs/*

  download-logs:
    desc: Download the logs from Taskcluster. Requires --task-group-id
    summary: |
      The logs will be saved to: ./data/taskcluster-logs

      Example:
        task download-logs -- --task-group-id GU9ZyWFhRDe_nxlAHcen8g
    deps: [poetry-install-utils]
    cmds:
      - >-
        poetry run python -W ignore utils/taskcluster_downloader.py
        --mode=logs {{.CLI_ARGS}}

  download-evals:
    desc: Downloads evaluation results from Taskcluster
    summary: |
      The evals will be saved to: ./data/taskcluster-evals
      Example: `task download-evals -- --task-group-id GU9ZyWFhRDe_nxlAHcen8g`
    deps: [poetry-install-utils]
    cmds:
      - >-
        poetry run python -W ignore utils/taskcluster_downloader.py
        --mode=evals {{.CLI_ARGS}}

  download-models:
    desc: Downloads models from Taskcluster
    summary: |
      The models will be saved to: ./data/taskcluster-model
      Example: `task download-models -- --task-group-id GU9ZyWFhRDe_nxlAHcen8g`
    deps: [poetry-install-utils]
    cmds:
      - >-
        poetry run python -W ignore utils/taskcluster_downloader.py
        --mode=model {{.CLI_ARGS}}

  config-generator:
    desc: Create a training config for a language pair
    summary: |
      Generates a training config for the given language pair via utils/config_generator.py.
      Example: `task config-generator -- en fi`
    deps: [poetry-install-utils]
    cmds:
      - >-
        PYTHONPATH=$(pwd) poetry run python -W ignore utils/config_generator.py {{.CLI_ARGS}}

  build-mono-nllb:
    desc: Build a monolingual NLLB dataset.
    summary: |
      The dataset will be saved to: ./data/nllb/nllb-mono-{lang}.txt.gz
      Example: `task build-mono-nllb -- sl`
    deps: [poetry-install-utils]
    cmds:
      - >-
        PYTHONPATH=$(pwd) poetry run python -W ignore utils/build-mono-nllb.py {{.CLI_ARGS}}

  opuscleaner:
    desc: Run the opuscleaner tool.
    deps: [poetry-install-opuscleaner]
    cmds:
      - poetry run opuscleaner-server serve --host=0.0.0.0 --port=8000
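  # Note on `opuscleaner` (above): given the --host/--port flags, the OpusCleaner UI
  # should be reachable at http://localhost:8000 once the server is running (assumed
  # from the flags above; adjust if you change the port).
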
  inference-clean:
    desc: Clean build artifacts from the inference directory.
    cmds:
      - >-
        task docker-run -- ./inference/scripts/clean.sh

  inference-build:
    desc: Build the inference engine.
    cmds:
      - >-
        task docker-run -- ./inference/scripts/build-local.sh

  inference-test:
    desc: Run the inference tests.
    cmds:
      - >-
        task docker-run -- ./inference/scripts/unit-tests.sh

  inference-build-wasm:
    desc: Build the inference engine for WASM.
    cmds:
      - >-
        task docker-run -- ./inference/scripts/build-wasm.sh

  lint-black:
    desc: Checks the styling of the Python code with Black.
    deps: [poetry-install-black]
    cmds:
      - ./utils/tasks/black-check.sh

  lint-black-fix:
    desc: Fixes the styling of the Python code with Black.
    deps: [poetry-install-black]
    cmds:
      - poetry run black . {{.CLI_ARGS}}

  lint-ruff:
    desc: Lints the Python code with the ruff linter.
    deps: [poetry-install-lint]
    cmds:
      - poetry run ruff --version
      - poetry run ruff check . {{.CLI_ARGS}}

  lint-ruff-fix:
    desc: Fixes Python code lint errors with the ruff linter.
    deps: [poetry-install-lint]
    cmds:
      - poetry run ruff --version
      - poetry run ruff check . --fix {{.CLI_ARGS}}

  lint-fix:
    desc: Fix all automatically fixable errors. This is useful to run before pushing.
    cmds:
      - task: lint-black-fix
      - task: lint-ruff-fix

  lint:
    desc: Run all available linting tools.
    cmds:
      - task: lint-black
      - task: lint-ruff

  test:
    desc: Run the Python pytests on the current host.
    summary: |
      Some tests only pass in Docker. You can run this command outside of Docker for
      some of the tests, or after running `task docker` to run them in the Docker image.
      Without any arguments, it runs all of the tests, searching the paths specified in
      testpaths in pyproject.toml.

      You can also specify a specific test to run:

        task test -- tests/test_alignments.py
    cmds:
      - poetry install --only tests --only utils --no-root
      - PYTHONPATH="$(pwd):$(pwd)/taskcluster/scripts/pipeline" poetry run pytest -vv {{.CLI_ARGS}}

  test-fast:
    desc: Re-run tests in a faster configuration.
    summary: |
      This command skips taskgraph generation and skips the poetry install in order to
      re-run tests quickly. If the taskgraph or dependencies are out of date, then tests
      may incorrectly fail. It also outputs the captured stdout.

        task test-fast -- tests/test_alignments.py
    cmds:
      - >-
        SKIP_TASKGRAPH=1 PYTHONPATH="$(pwd):$(pwd)/taskcluster/scripts/pipeline"
        poetry run pytest -vv -s {{.CLI_ARGS}}
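  # Note on `test-fast` (above): SKIP_TASKGRAPH=1 is presumably how the test suite is told
  # to skip taskgraph generation (the summary describes the effect); fall back to
  # `task test` whenever the taskgraph or the dependencies may have changed.
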
  test-docker:
    desc: Run the unit tests in the docker image. Some tests require the pre-built Linux executables.
    cmds:
      - task docker-run -- task test

  train:
    desc: Start a training run
    summary: Open up the train task from the CLI based on your current branch.
    deps: [poetry-install-utils, poetry-install-taskcluster]
    cmds:
      - >-
        poetry run python -W ignore utils/train.py {{.CLI_ARGS}}

  docker:
    desc: Interactively run the local docker test image.
    deps: [docker-build]
    summary: |
      The local docker image includes the Linux x86 image, and pre-built binaries
      that are used in training.
    cmds:
      - utils/tasks/docker-run.sh bash

  docker-run:
    desc: Run a command in the local docker instance, e.g. `task docker-run -- echo "hello"`
    deps: [docker-build]
    summary: |
      The local docker image includes the Linux x86 image, and pre-built binaries
      that are used in training.
    cmds:
      - utils/tasks/docker-run.sh {{.CLI_ARGS}}
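  # Note on `docker-run` (above): everything after `--` is passed through {{.CLI_ARGS}} and
  # runs inside the container, e.g. `task docker-run -- task test` (as used by `test-docker`)
  # or `task docker-run -- ./inference/scripts/build-local.sh` (as used by `inference-build`).
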
  docker-build:
    desc: Build the local docker image that includes the proper Linux binaries for training
    cmds:
      - ./utils/tasks/docker-build.sh

  taskgraph-requirements:
    desc: Installs the taskgraph requirements.
    internal: true
    cmds:
      - poetry run --directory ./taskgraph -- pip3 install -r taskcluster/requirements.txt

  taskgraph-validate:
    desc: Validates Taskcluster task graph locally
    deps: [taskgraph-requirements]
    cmds:
      - >-
        TASKCLUSTER_ROOT_URL=""
        poetry run --directory ./taskgraph --
        taskgraph full
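  # Note on `taskgraph-validate` (above): TASKCLUSTER_ROOT_URL is deliberately cleared so
  # that `taskgraph full` generates the graph locally rather than against a live Taskcluster
  # deployment (assumed intent, inferred from the empty override).
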
  taskgraph-diff:
    desc: Generates a diff of the full taskgraph against BASE_REV
    summary: |
      Generates diffs of the full taskgraph against BASE_REV. Any parameters that were
      different between the current code and BASE_REV will have their diffs logged
      to OUTPUT_FILE.
    deps: [taskgraph-requirements]
    vars:
      OUTPUT_FILE: '{{.OUTPUT_FILE | default "./data/taskgraph.diff"}}'
      BASE_REV: '{{.BASE_REV | default "main"}}'
    cmds:
      - >-
        TASKCLUSTER_ROOT_URL=""
        poetry run --directory ./taskgraph --
        taskgraph full --json
        --parameters "taskcluster/test/params"
        --output-file "{{.OUTPUT_FILE}}"
        --diff "{{.BASE_REV}}"
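  # Note on `taskgraph-diff` (above): OUTPUT_FILE and BASE_REV are Task variables with
  # defaults, so they can be overridden from the command line, e.g. (hypothetical values)
  # `task taskgraph-diff BASE_REV=upstream/main OUTPUT_FILE=./data/my-branch.diff`.
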
  taskgraph-test:
    desc: Run tests and validations against task generation
    cmds:
      - >-
        poetry run --directory taskgraph --
        pytest taskcluster/test

  docs:
    desc: Run the GitHub pages Jekyll theme locally.
    cmds:
      - ./utils/tasks/serve-docs.sh

  preflight-check:
    desc: Perform pre-flight checks for a training run.
    deps: [poetry-install-utils]
    cmds:
      - poetry run python -W ignore utils/preflight_check.py {{.CLI_ARGS}}

  tensorboard:
    desc: Visualize training logs from task `download-logs` at http://localhost:6006
    summary: |
      Runs TensorBoard for the Marian training logs in the ./data/taskcluster-logs directory.
      The logs are converted to the TensorBoard format in the ./data/tensorboard-logs directory.
    deps: [poetry-install-tensorboard]
    cmds:
      - mkdir -p data/tensorboard-logs
      - >-
        poetry run marian-tensorboard
        --offline
        --log-file data/taskcluster-logs/**/*.log
        --work-dir data/tensorboard-logs
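  # Note on `tensorboard` (above): download the Marian logs first, e.g.
  # `task download-logs -- --task-group-id <task group id>`, so that
  # ./data/taskcluster-logs is populated before TensorBoard starts.
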
  find-corpus:
    desc: Finds all datasets for a language pair
    deps: [poetry-install-utils]
    cmds:
      - poetry run python -W ignore utils/find_corpus.py {{.CLI_ARGS}}
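  # Note on `find-corpus` (above): arguments after `--` are passed straight to
  # utils/find_corpus.py. A hypothetical invocation for an English-Finnish pair would be
  # `task find-corpus -- en fi` (see utils/find_corpus.py for the exact arguments).
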
  run-model:
    desc: Run a Marian server that loads a model from data/models/$MODEL_TASK
    deps: [poetry-install-utils]
    cmds:
      - >-
        PYTHONPATH=$(pwd) poetry run python -W ignore utils/run_model.py {{.CLI_ARGS}}
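  # Note on `run-model` (above): per the description, the model is read from
  # data/models/$MODEL_TASK, which suggests MODEL_TASK is expected in the environment,
  # e.g. (hypothetical) `MODEL_TASK=<taskcluster task id> task run-model`.
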
  update-requirements:
    desc: Update the requirements.txt file for a pipeline script.
    summary: |
      Example usage:
        task update-requirements -- pipeline/eval/requirements/eval.in
    cmds:
      # Make sure a file was given to update.
      - >-
        if [[ -z "{{.CLI_ARGS}}" ]]; then
          echo "Provide a path to the .in file";
          echo "For example:"
          echo "task update-requirements -- pipeline/eval/requirements/eval.in";
          exit 1
        fi
      # Make sure the command is being run inside Docker; if not, re-run it there.
      - >-
        if [[ -z "$IS_DOCKER" ]]; then
          task docker-run -- task update-requirements -- {{.CLI_ARGS}} && exit
        fi
      # Make sure pip-tools is available in Docker.
      - >-
        if ! command -v pip-compile &> /dev/null; then
          pip install pip-tools
        fi
      # Finally, generate the hashes.
      - pip-compile --generate-hashes {{.CLI_ARGS}} --allow-unsafe