Install transformers model into the docker image (#912)

2023-01-25 09:06:02 +02:00 · 2023-01-25 09:06:02 +02:00 · 105d9455c5
--- a/.pipelines/templates/e2e-tests.yml
+++ b/.pipelines/templates/e2e-tests.yml
@ -29,7 +29,7 @@ steps:
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest-azurepipelines
-          python -m spacy download en_core_web_sm
+          python -m spacy download en_core_web_lg
      workingDirectory: e2e-tests
      displayName: Install dependencies

--- a/docker-compose-transformers.yml
+++ b/docker-compose-transformers.yml
@ -0,0 +1,24 @@
+version: '3'
+services:
+  presidio-anonymizer:
+    image: ${REGISTRY_NAME}${IMAGE_PREFIX}presidio-anonymizer${TAG}
+    build:
+      context: ./presidio-anonymizer
+      args:
+        - NAME=presidio-anonymizer
+    environment:
+      - PORT=5001
+    ports:
+      - "5001:5001"
+  presidio-analyzer:
+    image: ${REGISTRY_NAME}${IMAGE_PREFIX}presidio-analyzer${TAG}
+    build:
+      context: ./presidio-analyzer
+      args:
+        - NAME=presidio-analyzer
+        - NLP_CONF_FILE=conf/transformers.yaml
+      dockerfile: Dockerfile.transformers
+    environment:
+      - PORT=5001
+    ports:
+      - "5002:5001"
--- a/e2e-tests/tests/test_e2e_integration_flows.py
+++ b/e2e-tests/tests/test_e2e_integration_flows.py
@ -253,7 +253,7 @@ def test_given_text_with_pii_using_package_then_analyze_and_anonymize_complete_s
    # Create configuration containing engine name and models
    configuration = {
        "nlp_engine_name": "spacy",
-        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+        "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
    }

    # Create NLP engine based on configuration
--- a/presidio-analyzer/Dockerfile.transformers
+++ b/presidio-analyzer/Dockerfile.transformers
@ -0,0 +1,22 @@
+FROM python:3.9-slim
+
+ARG NAME
+ARG NLP_CONF_FILE=conf/transformers.yaml
+ENV PIPENV_VENV_IN_PROJECT=1
+ENV PIP_NO_CACHE_DIR=1
+WORKDIR /usr/bin/${NAME}
+
+COPY ./Pipfile* /usr/bin/${NAME}/
+RUN pip install pipenv \
+  && pipenv sync
+RUN pipenv install torch transformers huggingface_hub --skip-lock
+
+# install nlp models specified in NLP_CONF_FILE (conf/transformers.yaml by default)
+COPY ./install_nlp_models.py /usr/bin/${NAME}/
+COPY ${NLP_CONF_FILE} /usr/bin/${NAME}/${NLP_CONF_FILE}
+
+RUN pipenv run python install_nlp_models.py --conf_file ${NLP_CONF_FILE}
+
+COPY . /usr/bin/${NAME}/
+EXPOSE ${PORT}
+CMD pipenv run python app.py --host 0.0.0.0
--- a/presidio-analyzer/conf/default.yaml
+++ b/presidio-analyzer/conf/default.yaml
@ -3,4 +3,3 @@ models:
  -
    lang_code: en
    model_name: en_core_web_lg
-
--- a/presidio-analyzer/conf/spacy.yaml
+++ b/presidio-analyzer/conf/spacy.yaml
@ -2,4 +2,4 @@ nlp_engine_name: spacy
 models:
  -
    lang_code: en
-    model_name: en_core_web_sm
+    model_name: en_core_web_lg
--- a/presidio-analyzer/install_nlp_models.py
+++ b/presidio-analyzer/install_nlp_models.py
@ -1,7 +1,9 @@
 """Install the default NLP models defined in the provided yaml file."""

-import logging
 import argparse
+import logging
+from typing import Union, Dict
+
 import spacy
 import yaml

@ -11,13 +13,26 @@ except ImportError:
    # stanza should be installed manually
    stanza = None

+try:
+    import transformers
+    from huggingface_hub import snapshot_download
+    from transformers import AutoTokenizer, AutoModelForTokenClassification
+except ImportError:
+    # transformers should be installed manually
+    transformers = None
+
 logger = logging.getLogger()
 logger.setLevel("INFO")
 logger.addHandler(logging.StreamHandler())


 def install_models(conf_file: str) -> None:
-    """Installs models in conf/default.yaml."""
+    """Install the models listed in the given yaml configuration file.
+
+    :param conf_file: Path to the yaml file containing the models to install.
+    See examples in the conf directory.
+    """
+
    nlp_configuration = yaml.safe_load(open(conf_file))

    logger.info(f"Installing models from configuration: {nlp_configuration}")
@ -36,10 +51,7 @@ def install_models(conf_file: str) -> None:
    logger.info("finished installing models")


-def _download_model(engine_name: str, model_name: str) -> None:
-    if engine_name not in ("spacy", "stanza"):
-        raise ValueError(f"Unsupported nlp engine: {engine_name}")
-
+def _download_model(engine_name: str, model_name: Union[str, Dict[str, str]]) -> None:
    if engine_name == "spacy":
        spacy.cli.download(model_name)
    elif engine_name == "stanza":
@ -47,6 +59,40 @@ def _download_model(engine_name: str, model_name: str) -> None:
            stanza.download(model_name)
        else:
            raise ImportError("stanza is not installed")
+    elif engine_name == "transformers":
+        if transformers:
+            _install_transformers_spacy_models(model_name)
+        else:
+            raise ImportError("transformers is not installed")
+    else:
+        raise ValueError(f"Unsupported nlp engine: {engine_name}")
+
+
+def _install_transformers_spacy_models(model_name: Dict[str, str]) -> None:
+    if "spacy" not in model_name:
+        raise ValueError(
+            "transformers config should contain "
+            "a spacy model/pipeline such as en_core_web_sm"
+        )
+    if "transformers" not in model_name:
+        raise ValueError(
+            "transformers config should contain a path to a transformers model"
+        )
+
+    spacy_model = model_name["spacy"]
+    transformers_model = model_name["transformers"]
+
+    # download spacy model/pipeline
+    logger.info(f"Installing spaCy model: {spacy_model}")
+    spacy.cli.download(spacy_model)
+
+    # download transformers model
+    logger.info(f"Installing transformers model: {transformers_model}")
+    snapshot_download(repo_id=transformers_model)
+
+    # Instantiate to make sure it's downloaded during installation and not at runtime
+    AutoTokenizer.from_pretrained(transformers_model)
+    AutoModelForTokenClassification.from_pretrained(transformers_model)


 if __name__ == "__main__":
--- a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py
+++ b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py
@ -9,6 +9,7 @@ from presidio_analyzer.nlp_engine import SpacyNlpEngine


 try:
+    import torch
    import transformers
    from transformers import (
        AutoTokenizer,
@ -16,6 +17,7 @@ try:
        pipeline,
    )
 except ImportError:
+    torch = None
    transformers = None

 logger = logging.getLogger("presidio-analyzer")
--- a/presidio-analyzer/tests/conftest.py
+++ b/presidio-analyzer/tests/conftest.py
@ -52,6 +52,15 @@ def nlp_engines(request, nlp_engine_provider) -> Dict[str, NlpEngine]:
    for name, engine_cls in nlp_engines.items():
        if name == "spacy" and not request.config.getoption("--runfast"):
            available_engines[f"{name}_en"] = engine_cls({"en": "en_core_web_lg"})
+        elif name == "transformers" and not request.config.getoption("--runfast"):
+            available_engines[f"{name}_en"] = engine_cls(
+                {
+                    "en": {
+                        "spacy": "en_core_web_lg",
+                        "transformers": "dslim/bert-base-NER",
+                    }
+                }
+            )
        else:
            available_engines[f"{name}_en"] = engine_cls()

--- a/presidio-image-redactor/presidio_image_redactor/entities/image_recognizer_result.py
+++ b/presidio-image-redactor/presidio_image_redactor/entities/image_recognizer_result.py
@ -47,3 +47,16 @@ class ImageRecognizerResult(RecognizerResult):
        equal_box2 = (self.width == other.width) and (self.height == other.height)
        equal_box = equal_box1 and equal_box2
        return equal_type and equal_pos and equal_score and equal_box
+
+    def __str__(self) -> str:
+        """Return a string representation of the instance."""
+        return (
+            f"type: {self.entity_type}, "
+            f"start: {self.start}, "
+            f"end: {self.end}, "
+            f"score: {self.score}, "
+            f"left: {self.left}, "
+            f"top: {self.top}, "
+            f"width: {self.width}, "
+            f"height: {self.height}"
+        )