Install transformers model into the docker image (#912)

This commit is contained in:
Omri Mendels 2023-01-25 09:06:02 +02:00 committed by GitHub
Parent 5512a39ce7
Commit 105d9455c5
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 125 additions and 10 deletions

View file

@ -29,7 +29,7 @@ steps:
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install pytest-azurepipelines
python -m spacy download en_core_web_sm
python -m spacy download en_core_web_lg
workingDirectory: e2e-tests
displayName: Install dependencies

View file

@ -0,0 +1,24 @@
# Compose definitions for the Presidio services.
version: '3'
services:
  # Anonymizer: stock image; service listens on port 5001.
  presidio-anonymizer:
    image: ${REGISTRY_NAME}${IMAGE_PREFIX}presidio-anonymizer${TAG}
    build:
      context: ./presidio-anonymizer
      args:
        - NAME=presidio-anonymizer
    environment:
      - PORT=5001
    ports:
      - "5001:5001"
  # Analyzer: built from Dockerfile.transformers so the transformers NLP
  # models are baked into the image at build time (see NLP_CONF_FILE).
  # Host port 5002 maps to the container's 5001.
  presidio-analyzer:
    image: ${REGISTRY_NAME}${IMAGE_PREFIX}presidio-analyzer${TAG}
    build:
      context: ./presidio-analyzer
      args:
        - NAME=presidio-analyzer
        - NLP_CONF_FILE=conf/transformers.yaml
      dockerfile: Dockerfile.transformers
    environment:
      - PORT=5001
    ports:
      - "5002:5001"

View file

@ -253,7 +253,7 @@ def test_given_text_with_pii_using_package_then_analyze_and_anonymize_complete_s
# Create configuration containing engine name and models
configuration = {
"nlp_engine_name": "spacy",
"models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
"models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
}
# Create NLP engine based on configuration

View file

@ -0,0 +1,22 @@
# Image for presidio-analyzer with transformers-based NLP models
# pre-downloaded at build time.
FROM python:3.9-slim
# NAME: service directory name; NLP_CONF_FILE: path (relative to the build
# context) of the NLP configuration listing the models to install.
ARG NAME
ARG NLP_CONF_FILE=conf/transformers.yaml
# Keep the pipenv virtualenv inside the project dir; disable pip's cache.
ENV PIPENV_VENV_IN_PROJECT=1
ENV PIP_NO_CACHE_DIR=1
WORKDIR /usr/bin/${NAME}
# Install locked dependencies first so this layer caches across code changes.
COPY ./Pipfile* /usr/bin/${NAME}/
RUN pip install pipenv \
&& pipenv sync
# The transformers stack is not in the Pipfile; install it without relocking.
RUN pipenv install torch transformers huggingface_hub --skip-lock
# Install the NLP models specified in ${NLP_CONF_FILE} during the build so
# the container starts without needing network access at runtime.
COPY ./install_nlp_models.py /usr/bin/${NAME}/
COPY ${NLP_CONF_FILE} /usr/bin/${NAME}/${NLP_CONF_FILE}
RUN pipenv run python install_nlp_models.py --conf_file ${NLP_CONF_FILE}
COPY . /usr/bin/${NAME}/
# NOTE(review): PORT is not declared via ARG/ENV in this Dockerfile —
# presumably supplied by compose at runtime; confirm EXPOSE resolves as intended.
EXPOSE ${PORT}
CMD pipenv run python app.py --host 0.0.0.0

View file

@ -3,4 +3,3 @@ models:
-
lang_code: en
model_name: en_core_web_lg

View file

@ -2,4 +2,4 @@ nlp_engine_name: spacy
models:
-
lang_code: en
model_name: en_core_web_sm
model_name: en_core_web_lg

View file

@ -1,7 +1,9 @@
"""Install the default NLP models defined in the provided yaml file."""
import logging
import argparse
import logging
from typing import Union, Dict
import spacy
import yaml
@ -11,13 +13,26 @@ except ImportError:
# stanza should be installed manually
stanza = None
try:
import transformers
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForTokenClassification
except ImportError:
# transformers should be installed manually
transformers = None
logger = logging.getLogger()
logger.setLevel("INFO")
logger.addHandler(logging.StreamHandler())
def install_models(conf_file: str) -> None:
"""Installs models in conf/default.yaml."""
"""Installs models in conf/default.yaml.
:param conf_file: Path to the yaml file containing the models to install.
See examples in the conf directory.
"""
nlp_configuration = yaml.safe_load(open(conf_file))
logger.info(f"Installing models from configuration: {nlp_configuration}")
@ -36,10 +51,7 @@ def install_models(conf_file: str) -> None:
logger.info("finished installing models")
def _download_model(engine_name: str, model_name: str) -> None:
if engine_name not in ("spacy", "stanza"):
raise ValueError(f"Unsupported nlp engine: {engine_name}")
def _download_model(engine_name: str, model_name: Union[str, Dict[str, str]]) -> None:
if engine_name == "spacy":
spacy.cli.download(model_name)
elif engine_name == "stanza":
@ -47,6 +59,40 @@ def _download_model(engine_name: str, model_name: str) -> None:
stanza.download(model_name)
else:
raise ImportError("stanza is not installed")
elif engine_name == "transformers":
if transformers:
_install_transformers_spacy_models(model_name)
else:
raise ImportError("transformers is not installed")
else:
raise ValueError(f"Unsupported nlp engine: {engine_name}")
def _install_transformers_spacy_models(model_name: Dict[str, str]) -> None:
    """Download the spaCy pipeline and transformers model for a transformers engine.

    :param model_name: Mapping with a "spacy" key (spaCy pipeline name) and a
        "transformers" key (Hugging Face model repository id).
    :raises ValueError: If either required key is missing.
    """
    required = {
        "spacy": (
            "transformers config should contain "
            "a spacy model/pipeline such as en_core_web_sm"
        ),
        "transformers": (
            "transformers config should contain a path to a transformers model"
        ),
    }
    for key, hint in required.items():
        if key not in model_name:
            raise ValueError(hint)

    spacy_pipeline = model_name["spacy"]
    hf_model = model_name["transformers"]

    # Fetch the spaCy pipeline, then snapshot the Hugging Face repository.
    logger.info(f"Installing spaCy model: {spacy_pipeline}")
    spacy.cli.download(spacy_pipeline)

    logger.info(f"Installing transformers model: {hf_model}")
    snapshot_download(repo_id=hf_model)

    # Loading once here forces the download/caching to happen at install
    # time rather than on first use at runtime.
    AutoTokenizer.from_pretrained(hf_model)
    AutoModelForTokenClassification.from_pretrained(hf_model)
if __name__ == "__main__":

View file

@ -9,6 +9,7 @@ from presidio_analyzer.nlp_engine import SpacyNlpEngine
try:
import torch
import transformers
from transformers import (
AutoTokenizer,
@ -16,6 +17,7 @@ try:
pipeline,
)
except ImportError:
torch = None
transformers = None
logger = logging.getLogger("presidio-analyzer")

View file

@ -52,6 +52,15 @@ def nlp_engines(request, nlp_engine_provider) -> Dict[str, NlpEngine]:
for name, engine_cls in nlp_engines.items():
if name == "spacy" and not request.config.getoption("--runfast"):
available_engines[f"{name}_en"] = engine_cls({"en": "en_core_web_lg"})
elif name == "transformers" and not request.config.getoption("--runfast"):
available_engines[f"{name}_en"] = engine_cls(
{
"en": {
"spacy": "en_core_web_lg",
"transformers": "dslim/bert-base-NER",
}
}
)
else:
available_engines[f"{name}_en"] = engine_cls()

View file

@ -47,3 +47,16 @@ class ImageRecognizerResult(RecognizerResult):
equal_box2 = (self.width == other.width) and (self.height == other.height)
equal_box = equal_box1 and equal_box2
return equal_type and equal_pos and equal_score and equal_box
def __str__(self) -> str:
    """Return a readable "name: value" summary of this detection result."""
    parts = (
        ("type", self.entity_type),
        ("start", self.start),
        ("end", self.end),
        ("score", self.score),
        ("left", self.left),
        ("top", self.top),
        ("width", self.width),
        ("height", self.height),
    )
    return ", ".join(f"{label}: {value}" for label, value in parts)