Updating with better deployment to azure devops kubernetes env. Adding better batch inputs that use nlp.pipe

This commit is contained in:
Kabir Khan 2019-10-21 13:53:31 -07:00
Родитель 4e9c1ba1f8
Коммит 40e59912fd
16 изменённых файлов: 379 добавлений и 89 удалений

Просмотреть файл

@ -1,5 +1,6 @@
{
"project_name": "Python Azure Search Cognitive Skill API",
"project_slug": "{{ cookiecutter.project_name.lower().replace(' ', '_').replace('-', '_') }}",
"project_short_description": "Python API that for Custom Cognitive Skills in Azure Search"
"project_short_description": "Python API for Custom Cognitive Skills in Azure Search",
"project_language": "This must be one of spaCy's default languages. See https://spacy.io/usage for a supported list."
}

Просмотреть файл

@ -1,15 +1,17 @@
FROM python:3.6
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.7
ENV PORT 8080
EXPOSE ${PORT}
WORKDIR /usr/src
ENV APP_MODULE app.api:app
ENV LOG_LEVEL debug
ENV WEB_CONCURRENCY 2
# Install spacy requirements separately first so that Docker will
# cache the (somewhat) expensive download of a spacy model
COPY ./requirements/spacy.txt ./requirements/spacy.txt
RUN pip install -r requirements/spacy.txt
RUN spacy download {{cookiecutter.project_language}}
COPY ./requirements/base.txt ./requirements/base.txt
RUN pip install -r requirements/base.txt
COPY ./app ./app
COPY start.sh .
CMD ["bash", "start.sh"]
COPY .env /app/.env
COPY ./app /app/app

Просмотреть файл

@ -22,7 +22,7 @@ To run locally in debug mode run:
```
cd ./{{cookiecutter.project_slug}}
uvicorn app.api:app --debug
uvicorn app.api:app --reload
```
Open your browser to http://localhost:8000/docs to view the SwaggerUI.

Просмотреть файл

@ -1,2 +1,2 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# Licensed under the MIT License.

Просмотреть файл

@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from collections import defaultdict
import os
from dotenv import load_dotenv, find_dotenv
@ -10,11 +11,17 @@ from starlette.responses import RedirectResponse
import spacy
import uvicorn
from app.models import RecordsRequest, RecordsResponse
from app.models import (
ENT_PROP_MAP,
RecordsRequest,
RecordsResponse,
RecordsEntitiesByTypeResponse,
)
from app.spacy_extractor import SpacyExtractor
load_dotenv(find_dotenv())
prefix = os.environ.get("CLUSTER_ROUTE_PREFIX")
prefix = os.getenv("CLUSTER_ROUTE_PREFIX")
if not prefix:
prefix = ""
prefix = prefix.rstrip("/")
@ -24,54 +31,85 @@ app = FastAPI(
title="{{cookiecutter.project_name}}",
version="1.0",
description="{{cookiecutter.project_short_description}}",
openapi_prefix=prefix
openapi_prefix=prefix,
)
nlp = spacy.load('en_core_web_sm')
nlp = spacy.load("{{cookiecutter.project_language}}")
extractor = SpacyExtractor(nlp)
def extract_from_text(text: str):
    """Extract Spacy Named Entities from raw text.

    Runs the module-level ``nlp`` pipeline over ``text`` and returns a list
    with one dict per entity, holding its surface text, its label and its
    start/end character offsets.
    """
    return [
        {
            "text": span.text,
            "label": span.label_,
            "start": span.start_char,
            "end": span.end_char,
        }
        for span in nlp(text).ents
    ]
# Maps an entity label to the property name under which entities with that
# label are grouped in the /entities_by_type response.
ENT_PROP_MAP = {
    "CARDINAL": "cardinals",
    "DATE": "dates",
    "EVENT": "events",
    "FAC": "facilities",
    "GPE": "gpes",
    "LANGUAGE": "languages",
    "LAW": "laws",
    "LOC": "locations",
    "MONEY": "money",
    "NORP": "norps",
    "ORDINAL": "ordinals",
    "ORG": "organizations",
    "PERCENT": "percentages",
    "PERSON": "people",
    "PRODUCT": "products",
    # Was misspelled "quanities", which leaked into the public response schema.
    "QUANTITY": "quantities",
    "TIME": "times",
    "WORK_OF_ART": "worksOfArt",
}
@app.get('/', include_in_schema=False)
@app.get("/", include_in_schema=False)
def docs_redirect():
return RedirectResponse(f'{prefix}/docs')
return RedirectResponse(f"{prefix}/docs")
@app.post("/entities", response_model=RecordsResponse, tags=["NER"])
async def extract_entities(body: RecordsRequest):
    """Extract Named Entities from a batch of Records.

    Every record in ``body.values`` is sent through the shared extractor in
    one batch. The response carries one value per extracted result, with the
    result's id as ``recordId`` and its entity list under ``data.entities``.
    """
    documents = [
        {"id": val.recordId, "text": val.data.text} for val in body.values
    ]
    entities_res = extractor.extract_entities(documents)

    # No debug print here: it would write every request's extracted content
    # to stdout on each call.
    return {
        "values": [
            {"recordId": er["id"], "data": {"entities": er["entities"]}}
            for er in entities_res
        ]
    }
@app.post(
    "/entities_by_type", response_model=RecordsEntitiesByTypeResponse, tags=["NER"]
)
async def extract_entities_by_type(body: RecordsRequest):
    """Extract Named Entities from a batch of Records separated by entity label.

    This route can be used directly as a Cognitive Skill in Azure Search
    For Documentation on integration with Azure Search, see here:
    https://docs.microsoft.com/en-us/azure/search/cognitive-search-custom-skill-interface

    Each value in the response has the record's id as ``recordId`` and a
    ``data`` dict mapping a property name (see ``ENT_PROP_MAP``) to the list
    of entity names found with that label.
    """
    documents = [
        {"id": val.recordId, "text": val.data.text} for val in body.values
    ]
    entities_res = extractor.extract_entities(documents)

    res = []
    for er in entities_res:
        groupby = defaultdict(list)
        for ent in er["entities"]:
            label = ent["label"]
            # Labels missing from ENT_PROP_MAP (e.g. from a model with a
            # different label scheme) fall back to the lower-cased label
            # instead of raising KeyError and failing the whole batch.
            ent_prop = ENT_PROP_MAP.get(label, label.lower())
            groupby[ent_prop].append(ent["name"])
        res.append({"recordId": er["id"], "data": groupby})
    return {"values": res}

Просмотреть файл

@ -5,6 +5,28 @@ from typing import Dict, List, Optional
from pydantic import BaseModel, Schema
# Maps an entity label to the property name used as a key of the per-record
# `data` dict in the entities-by-type response.
ENT_PROP_MAP = {
    "CARDINAL": "cardinals",
    "DATE": "dates",
    "EVENT": "events",
    "FAC": "facilities",
    "GPE": "gpes",
    "LANGUAGE": "languages",
    "LAW": "laws",
    "LOC": "locations",
    "MONEY": "money",
    "NORP": "norps",
    "ORDINAL": "ordinals",
    "ORG": "organizations",
    "PERCENT": "percentages",
    "PERSON": "people",
    "PRODUCT": "products",
    # Was misspelled "quanities", which leaked into the public response schema.
    "QUANTITY": "quantities",
    "TIME": "times",
    "WORK_OF_ART": "worksOfArt",
}
class RecordDataRequest(BaseModel):
text: str
language: str = "en"
@ -20,7 +42,7 @@ class RecordsRequest(BaseModel):
class RecordDataResponse(BaseModel):
entities: List[str]
entities: List
class Message(BaseModel):
@ -36,3 +58,12 @@ class RecordResponse(BaseModel):
# Batch response envelope: `values` holds a list of RecordResponse items.
class RecordsResponse(BaseModel):
    values: List[RecordResponse]
# One record of an entities-by-type response.
class RecordEntitiesByTypeResponse(BaseModel):
    # Id of the record this result belongs to.
    recordId: str
    # Maps a property name to a list of strings.
    data: Dict[str, List[str]]
# Batch response envelope: `values` holds a list of
# RecordEntitiesByTypeResponse items.
class RecordsEntitiesByTypeResponse(BaseModel):
    values: List[RecordEntitiesByTypeResponse]

Просмотреть файл

@ -0,0 +1,74 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from typing import Dict, List
import spacy
from spacy.language import Language
class SpacyExtractor:
    """Pipe records (an id plus a text body) through a spaCy model.

    All record texts are sent through the wrapped pipeline with a single
    ``nlp.pipe`` call. The entities found are returned per record, merged by
    a normalized entity key, with every match location kept.
    """

    def __init__(
        self, nlp: "Language", input_id_col: str = "id", input_text_col: str = "text"
    ):
        """Initialize the SpacyExtractor pipeline.

        nlp (spacy.language.Language): pre-loaded spacy language model
        input_id_col (str): property on each document to correlate with request
        input_text_col (str): property on each document to run the model on
        RETURNS (SpacyExtractor): The newly constructed object.
        """
        self.nlp = nlp
        self.input_id_col = input_id_col
        self.input_text_col = input_text_col

    def _name_to_id(self, text: str) -> str:
        """Utility function to do a messy normalization of an entity name.

        text (str): text to create "id" from
        RETURNS (str): the whitespace-separated tokens of `text`, lower-cased
            and joined with "-".
        """
        return "-".join(s.lower() for s in text.split())

    def _entity_key(self, ent):
        """Return the key under which repeated mentions of `ent` are merged.

        Uses the span's `kb_id`, then its `ent_id`, and finally the
        normalized entity text; each falsy candidate falls through to the next.
        """
        return ent.kb_id or ent.ent_id or self._name_to_id(ent.text)

    @staticmethod
    def _display_name(text: str) -> str:
        """Return `text` capitalized if it is entirely lower-case, else unchanged."""
        return text.capitalize() if text.lower() == text else text

    def extract_entities(self, records: List[Dict[str, str]]) -> List[Dict]:
        """Apply the pre-trained model to a batch of records.

        records (list): The list of "document" dictionaries each with an
            `id` and `text` property (or the property names configured in
            the constructor)
        RETURNS (list): One dict per record with the `id` of the correlating
            document and `entities`, a list of dicts with the entity `name`,
            `label` and all of its `matches` (start, end, text).
        """
        ids = (doc[self.input_id_col] for doc in records)
        texts = (doc[self.input_text_col] for doc in records)

        res = []
        for doc_id, spacy_doc in zip(ids, self.nlp.pipe(texts)):
            # Keyed by entity key; the first mention fixes the name and label.
            entities = {}
            for ent in spacy_doc.ents:
                key = self._entity_key(ent)
                if key not in entities:
                    entities[key] = {
                        "name": self._display_name(ent.text),
                        "label": ent.label_,
                        "matches": [],
                    }
                entities[key]["matches"].append(
                    {"start": ent.start_char, "end": ent.end_char, "text": ent.text}
                )
            res.append({"id": doc_id, "entities": list(entities.values())})
        return res

Просмотреть файл

@ -1,2 +1,2 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# Licensed under the MIT License.

Просмотреть файл

@ -21,17 +21,8 @@ def test_api():
software, which runs on its Echo and Dot devices, have clear leads in
consumer adoption."""
request_data = {
"values": [
{
"recordId": "a1",
"data": {
"text": text,
"language": "en"
}
}
]
"values": [{"recordId": "a1", "data": {"text": text, "language": "en"}}]
}
response = client.post("/spacy_entities", json=request_data)
@ -49,5 +40,5 @@ def test_api():
"Echo and Dot",
"Google",
"iPhones",
"Siri"
"Siri",
]

Просмотреть файл

@ -1,22 +1,111 @@
# Deploy to Azure Kubernetes Service
# Build and push image to Azure Container Registry; Deploy to Azure Kubernetes Service
# https://docs.microsoft.com/azure/devops/pipelines/languages/docker
trigger:
batch: true
branches:
include:
- master
- master
pool:
vmImage: 'ubuntu-16.04'
resources:
- repo: self
steps:
- script: |
docker build . -t {{cookiecutter.project_slug}}
displayName: Build Docker Container
- script: |
DOCKER_IMAGE=$(docker images -q | sed 1q)
docker run -i "$DOCKER_IMAGE" /bin/bash -c "black ./app --check"
displayName: Lint with Black
continueOnError: true
- script: |
DOCKER_IMAGE=$(docker images -q | sed 1q)
docker run -i -v junit:/usr/src "$DOCKER_IMAGE" /bin/bash -c "pytest app/tests --doctest-modules --junitxml=junit/test-results.xml --cov --cov-report=xml --cov-report=html"
displayName: Test with pytest
variables:
# Container registry service connection established during pipeline creation
# See: https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#sep-docreg
dockerRegistryServiceConnection: 'yourDockerRegistryServiceConnectionGUID'
imageRepository: 'yourImageRepositoryName'
containerRegistry: 'yourContainerRegistryName.azurecr.io'
dockerfilePath: '**/Dockerfile'
tag: '$(Build.BuildId)'
# Kubernetes Namespace
k8sNamespace: 'default'
imagePullSecret: 'yourImagePullSecretName-auth'
# See: https://docs.microsoft.com/en-us/azure/devops/pipelines/process/environments-kubernetes?view=azure-devops
envName: 'yourEnvName-cluster.default'
# Agent VM image name
vmImageName: 'ubuntu-latest'
stages:
- stage: Build
displayName: Build stage
jobs:
- job: Build
displayName: Build job
pool:
vmImage: $(vmImageName)
steps:
- task: Docker@2
displayName: Build image
inputs:
command: build
repository: $(imageRepository)
dockerfile: $(dockerfilePath)
containerRegistry: $(dockerRegistryServiceConnection)
tags: |
$(tag)
- script: |
DOCKER_IMAGE=$(docker images -q | sed 1q)
docker run -i "$DOCKER_IMAGE" /bin/bash -c "black ./app --check"
displayName: Run black check
- script: |
DOCKER_IMAGE=$(docker images -q | sed 1q)
docker run -i -v junit:/app/junit "$DOCKER_IMAGE" /bin/bash -c "pytest app/tests --doctest-modules --junitxml=junit/test-results.xml --cov --cov-report=xml --cov-report=html"
displayName: Test with pytest
- task: PublishTestResults@2
inputs:
testResultsFiles: 'junit/test-results.xml'
testRunTitle: 'Publish test results'
- task: Docker@2
displayName: Push image to container registry
inputs:
command: buildAndPush
repository: $(imageRepository)
dockerfile: $(dockerfilePath)
containerRegistry: $(dockerRegistryServiceConnection)
tags: |
$(tag)
- task: PublishPipelineArtifact@0
inputs:
artifactName: 'manifests'
targetPath: 'manifests'
- stage: Deploy_BVT
displayName: Deploy BVT
dependsOn: Build
jobs:
- deployment: Deploy_BVT
pool:
vmImage: $(vmImageName)
environment: '$(envName)'
strategy:
runOnce:
deploy:
steps:
- task: DownloadPipelineArtifact@1
inputs:
artifactName: 'manifests'
downloadPath: '$(System.ArtifactsDirectory)/manifests'
- task: KubernetesManifest@0
displayName: Create imagePullSecret
inputs:
action: createSecret
secretName: $(imagePullSecret)
namespace: $(k8sNamespace)
dockerRegistryEndpoint: $(dockerRegistryServiceConnection)
- task: KubernetesManifest@0
displayName: Deploy to Kubernetes cluster
inputs:
action: deploy
namespace: $(k8sNamespace)
manifests: |
$(System.ArtifactsDirectory)/manifests/deployment.yml
$(System.ArtifactsDirectory)/manifests/service.yml
imagePullSecrets: |
$(imagePullSecret)
containers: |
$(containerRegistry)/$(imageRepository):$(tag)

Просмотреть файл

@ -0,0 +1,6 @@
import uvicorn
from app.api import app
if __name__ == "__main__":
    import os

    # Serve on the port given by the PORT environment variable (the container
    # image exports it); fall back to the previously hard-coded 8080.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=int(os.environ.get("PORT", 8080)),
        log_level="info",
    )

Просмотреть файл

@ -0,0 +1,33 @@
# This Deployment Manifest is not parameterized so you'll need to update the app label,
# container image and name in order for your azure pipelines deployment to succeed
---
# apps/v1beta1 Deployments are no longer served as of Kubernetes 1.16;
# apps/v1 is the stable API and requires an explicit spec.selector.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: spacy-extractor
spec:
  replicas: 3
  # Must match the labels on the pod template below.
  selector:
    matchLabels:
      app: spacy-extractor
  strategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: spacy-extractor
    spec:
      containers:
        - name: spacy-extractor
          image: yourContainerRegistryName.azurecr.io/spacyextractor
          imagePullPolicy: Always
          resources:
            requests:
              memory: 0.1G
              cpu: 0.1
            limits:
              memory: 0.5G
              cpu: 0.5
          ports:
            - name: http-api
              containerPort: 8080
          env:
            - name: CLUSTER_ROUTE_PREFIX
              value: /spacy/

Просмотреть файл

@ -0,0 +1,28 @@
# This service assumes you're using Ambassador as an API Gateway.
# See https://getambassador.io for more info on setting up Ambassador with Kubernetes
---
apiVersion: v1
kind: Service
metadata:
name: spacy-extractor
annotations:
getambassador.io/config: |
---
apiVersion: ambassador/v1
kind: Mapping
name: spacy-extractor-mapping
prefix: /spacy/
service: spacy-extractor
timeout_ms: 20000
bypass_auth: true
spec:
selector:
app: spacy-extractor
ports:
- port: 80
targetPort: http-api
name: http
- port: 443
targetPort: http-api
name: https
type: ClusterIP

Просмотреть файл

@ -1,7 +1,8 @@
fastapi
requests
uvicorn
python-dotenv
pytest
pytest-cov
black
spacy==2.2.0
fastapi==0.42.0
requests==2.22.0
uvicorn==0.9.1
python-dotenv==0.10.3
pytest==5.2.1
pytest-cov==2.8.1
black==19.3b0

Просмотреть файл

@ -1,2 +0,0 @@
spacy
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz

Просмотреть файл

@ -1,2 +0,0 @@
#!/bin/bash
uvicorn app.api:app --host 0.0.0.0 --port ${PORT}