зеркало из https://github.com/microsoft/presidio.git
Install transformers model into the docker image (#912)
This commit is contained in:
Родитель
5512a39ce7
Коммит
105d9455c5
|
@ -29,7 +29,7 @@ steps:
|
|||
python -m pip install --upgrade pip
|
||||
pip install -r requirements.txt
|
||||
pip install pytest-azurepipelines
|
||||
python -m spacy download en_core_web_sm
|
||||
python -m spacy download en_core_web_lg
|
||||
workingDirectory: e2e-tests
|
||||
displayName: Install dependencies
|
||||
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
version: '3'
|
||||
services:
|
||||
presidio-anonymizer:
|
||||
image: ${REGISTRY_NAME}${IMAGE_PREFIX}presidio-anonymizer${TAG}
|
||||
build:
|
||||
context: ./presidio-anonymizer
|
||||
args:
|
||||
- NAME=presidio-anonymizer
|
||||
environment:
|
||||
- PORT=5001
|
||||
ports:
|
||||
- "5001:5001"
|
||||
presidio-analyzer:
|
||||
image: ${REGISTRY_NAME}${IMAGE_PREFIX}presidio-analyzer${TAG}
|
||||
build:
|
||||
context: ./presidio-analyzer
|
||||
args:
|
||||
- NAME=presidio-analyzer
|
||||
- NLP_CONF_FILE=conf/transformers.yaml
|
||||
dockerfile: Dockerfile.transformers
|
||||
environment:
|
||||
- PORT=5001
|
||||
ports:
|
||||
- "5002:5001"
|
|
@ -253,7 +253,7 @@ def test_given_text_with_pii_using_package_then_analyze_and_anonymize_complete_s
|
|||
# Create configuration containing engine name and models
|
||||
configuration = {
|
||||
"nlp_engine_name": "spacy",
|
||||
"models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
|
||||
"models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
|
||||
}
|
||||
|
||||
# Create NLP engine based on configuration
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
FROM python:3.9-slim
|
||||
|
||||
ARG NAME
|
||||
ARG NLP_CONF_FILE=conf/transformers.yaml
|
||||
ENV PIPENV_VENV_IN_PROJECT=1
|
||||
ENV PIP_NO_CACHE_DIR=1
|
||||
WORKDIR /usr/bin/${NAME}
|
||||
|
||||
COPY ./Pipfile* /usr/bin/${NAME}/
|
||||
RUN pip install pipenv \
|
||||
&& pipenv sync
|
||||
RUN pipenv install torch transformers huggingface_hub --skip-lock
|
||||
|
||||
# install nlp models specified in NLP_CONF_FILE (defaults to conf/transformers.yaml)
|
||||
COPY ./install_nlp_models.py /usr/bin/${NAME}/
|
||||
COPY ${NLP_CONF_FILE} /usr/bin/${NAME}/${NLP_CONF_FILE}
|
||||
|
||||
RUN pipenv run python install_nlp_models.py --conf_file ${NLP_CONF_FILE}
|
||||
|
||||
COPY . /usr/bin/${NAME}/
|
||||
EXPOSE ${PORT}
|
||||
CMD pipenv run python app.py --host 0.0.0.0
|
|
@ -3,4 +3,3 @@ models:
|
|||
-
|
||||
lang_code: en
|
||||
model_name: en_core_web_lg
|
||||
|
||||
|
|
|
@ -2,4 +2,4 @@ nlp_engine_name: spacy
|
|||
models:
|
||||
-
|
||||
lang_code: en
|
||||
model_name: en_core_web_sm
|
||||
model_name: en_core_web_lg
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
"""Install the default NLP models defined in the provided yaml file."""
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import logging
|
||||
from typing import Union, Dict
|
||||
|
||||
import spacy
|
||||
import yaml
|
||||
|
||||
|
@ -11,13 +13,26 @@ except ImportError:
|
|||
# stanza should be installed manually
|
||||
stanza = None
|
||||
|
||||
try:
|
||||
import transformers
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
||||
except ImportError:
|
||||
# transformers should be installed manually
|
||||
transformers = None
|
||||
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel("INFO")
|
||||
logger.addHandler(logging.StreamHandler())
|
||||
|
||||
|
||||
def install_models(conf_file: str) -> None:
|
||||
"""Installs models in conf/default.yaml."""
|
||||
"""Install the models listed in the given configuration file.
|
||||
|
||||
:param conf_file: Path to the yaml file containing the models to install.
|
||||
See examples in the conf directory.
|
||||
"""
|
||||
|
||||
nlp_configuration = yaml.safe_load(open(conf_file))
|
||||
|
||||
logger.info(f"Installing models from configuration: {nlp_configuration}")
|
||||
|
@ -36,10 +51,7 @@ def install_models(conf_file: str) -> None:
|
|||
logger.info("finished installing models")
|
||||
|
||||
|
||||
def _download_model(engine_name: str, model_name: str) -> None:
|
||||
if engine_name not in ("spacy", "stanza"):
|
||||
raise ValueError(f"Unsupported nlp engine: {engine_name}")
|
||||
|
||||
def _download_model(engine_name: str, model_name: Union[str, Dict[str, str]]) -> None:
|
||||
if engine_name == "spacy":
|
||||
spacy.cli.download(model_name)
|
||||
elif engine_name == "stanza":
|
||||
|
@ -47,6 +59,40 @@ def _download_model(engine_name: str, model_name: str) -> None:
|
|||
stanza.download(model_name)
|
||||
else:
|
||||
raise ImportError("stanza is not installed")
|
||||
elif engine_name == "transformers":
|
||||
if transformers:
|
||||
_install_transformers_spacy_models(model_name)
|
||||
else:
|
||||
raise ImportError("transformers is not installed")
|
||||
else:
|
||||
raise ValueError(f"Unsupported nlp engine: {engine_name}")
|
||||
|
||||
|
||||
def _install_transformers_spacy_models(model_name: Dict[str, str]) -> None:
    """Download the spaCy pipeline and transformers model named in the config.

    :param model_name: Mapping with two required keys: ``"spacy"`` (the spaCy
        model/pipeline to download, e.g. ``en_core_web_sm``) and
        ``"transformers"`` (the model identifier passed to
        ``snapshot_download`` as ``repo_id``).
    :raises ValueError: If ``model_name`` is a plain string, or if either the
        ``"spacy"`` or the ``"transformers"`` key is missing.
    """
    # A plain string would be accepted by the `in` checks below as substring
    # tests and then fail with a confusing TypeError when indexed, so reject
    # it explicitly with the same exception type used for malformed configs.
    if isinstance(model_name, str):
        raise ValueError(
            "transformers config should be a mapping with "
            "'spacy' and 'transformers' keys, "
            f"got a string instead: {model_name!r}"
        )
    if "spacy" not in model_name:
        raise ValueError(
            "transformers config should contain "
            "a spacy model/pipeline such as en_core_web_sm"
        )
    if "transformers" not in model_name:
        raise ValueError(
            "transformers config should contain a path to a transformers model"
        )

    spacy_model = model_name["spacy"]
    transformers_model = model_name["transformers"]

    # download spacy model/pipeline
    logger.info(f"Installing spaCy model: {spacy_model}")
    spacy.cli.download(spacy_model)

    # download transformers model
    logger.info(f"Installing transformers model: {transformers_model}")
    snapshot_download(repo_id=transformers_model)

    # Instantiate to make sure it's downloaded during installation and not runtime
    AutoTokenizer.from_pretrained(transformers_model)
    AutoModelForTokenClassification.from_pretrained(transformers_model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -9,6 +9,7 @@ from presidio_analyzer.nlp_engine import SpacyNlpEngine
|
|||
|
||||
|
||||
try:
|
||||
import torch
|
||||
import transformers
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
|
@ -16,6 +17,7 @@ try:
|
|||
pipeline,
|
||||
)
|
||||
except ImportError:
|
||||
torch = None
|
||||
transformers = None
|
||||
|
||||
logger = logging.getLogger("presidio-analyzer")
|
||||
|
|
|
@ -52,6 +52,15 @@ def nlp_engines(request, nlp_engine_provider) -> Dict[str, NlpEngine]:
|
|||
for name, engine_cls in nlp_engines.items():
|
||||
if name == "spacy" and not request.config.getoption("--runfast"):
|
||||
available_engines[f"{name}_en"] = engine_cls({"en": "en_core_web_lg"})
|
||||
elif name == "transformers" and not request.config.getoption("--runfast"):
|
||||
available_engines[f"{name}_en"] = engine_cls(
|
||||
{
|
||||
"en": {
|
||||
"spacy": "en_core_web_lg",
|
||||
"transformers": "dslim/bert-base-NER",
|
||||
}
|
||||
}
|
||||
)
|
||||
else:
|
||||
available_engines[f"{name}_en"] = engine_cls()
|
||||
|
||||
|
|
|
@ -47,3 +47,16 @@ class ImageRecognizerResult(RecognizerResult):
|
|||
equal_box2 = (self.width == other.width) and (self.height == other.height)
|
||||
equal_box = equal_box1 and equal_box2
|
||||
return equal_type and equal_pos and equal_score and equal_box
|
||||
|
||||
def __str__(self) -> str:
    """Render the entity fields and bounding box as one comma-separated line."""
    # Label/value pairs in the exact order they appear in the output string.
    labelled_values = (
        ("type", self.entity_type),
        ("start", self.start),
        ("end", self.end),
        ("score", self.score),
        ("left", self.left),
        ("top", self.top),
        ("width", self.width),
        ("height", self.height),
    )
    return ", ".join(f"{label}: {value}" for label, value in labelled_values)
|
||||
|
|
Загрузка…
Ссылка в новой задаче