Updating with better deployment to azure devops kubernetes env. Adding better batch inputs that use nlp.pipe
This commit is contained in:
Родитель
4e9c1ba1f8
Коммит
40e59912fd
|
@ -1,5 +1,6 @@
|
|||
{
|
||||
"project_name": "Python Azure Search Cognitive Skill API",
|
||||
"project_slug": "{{ cookiecutter.project_name.lower().replace(' ', '_').replace('-', '_') }}",
|
||||
"project_short_description": "Python API that for Custom Cognitive Skills in Azure Search"
|
||||
"project_short_description": "Python API for Custom Cognitive Skills in Azure Search",
|
||||
"project_language": "This must be one of spaCy's default languages. See https://spacy.io/usage for a supported list."
|
||||
}
|
|
@ -1,15 +1,17 @@
|
|||
FROM python:3.6
|
||||
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.7
|
||||
ENV PORT 8080
|
||||
EXPOSE ${PORT}
|
||||
WORKDIR /usr/src
|
||||
ENV APP_MODULE app.api:app
|
||||
ENV LOG_LEVEL debug
|
||||
ENV WEB_CONCURRENCY 2
|
||||
|
||||
# Install spacy requirements separately first so that Docker will
|
||||
# cache the (somewhat) expensive download of a spacy model
|
||||
COPY ./requirements/spacy.txt ./requirements/spacy.txt
|
||||
RUN pip install -r requirements/spacy.txt
|
||||
RUN spacy download {{cookiecutter.project_language}}
|
||||
|
||||
COPY ./requirements/base.txt ./requirements/base.txt
|
||||
RUN pip install -r requirements/base.txt
|
||||
COPY ./app ./app
|
||||
|
||||
COPY start.sh .
|
||||
|
||||
CMD ["bash", "start.sh"]
|
||||
COPY .env /app/.env
|
||||
COPY ./app /app/app
|
||||
|
|
|
@ -22,7 +22,7 @@ To run locally in debug mode run:
|
|||
|
||||
```
|
||||
cd ./{{cookiecutter.project_slug}}
|
||||
uvicorn app.api:app --debug
|
||||
uvicorn app.api:app --reload
|
||||
```
|
||||
Open your browser to http://localhost:8000/docs to view the SwaggerUI.
|
||||
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# Licensed under the MIT License.
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from collections import defaultdict
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv, find_dotenv
|
||||
|
@ -10,11 +11,17 @@ from starlette.responses import RedirectResponse
|
|||
import spacy
|
||||
import uvicorn
|
||||
|
||||
from app.models import RecordsRequest, RecordsResponse
|
||||
from app.models import (
|
||||
ENT_PROP_MAP,
|
||||
RecordsRequest,
|
||||
RecordsResponse,
|
||||
RecordsEntitiesByTypeResponse,
|
||||
)
|
||||
from app.spacy_extractor import SpacyExtractor
|
||||
|
||||
|
||||
load_dotenv(find_dotenv())
|
||||
prefix = os.environ.get("CLUSTER_ROUTE_PREFIX")
|
||||
prefix = os.getenv("CLUSTER_ROUTE_PREFIX")
|
||||
if not prefix:
|
||||
prefix = ""
|
||||
prefix = prefix.rstrip("/")
|
||||
|
@ -24,54 +31,85 @@ app = FastAPI(
|
|||
title="{{cookiecutter.project_name}}",
|
||||
version="1.0",
|
||||
description="{{cookiecutter.project_short_description}}",
|
||||
openapi_prefix=prefix
|
||||
openapi_prefix=prefix,
|
||||
)
|
||||
|
||||
nlp = spacy.load('en_core_web_sm')
|
||||
nlp = spacy.load("{{cookiecutter.project_language}}")
|
||||
extractor = SpacyExtractor(nlp)
|
||||
|
||||
|
||||
def extract_from_text(text: str):
    """Run the module-level spaCy pipeline over ``text`` and describe its entities.

    Returns a list with one dict per named entity, in the order the pipeline
    reports them. Each dict holds the entity's surface ``text``, its
    ``label`` and its ``start``/``end`` character offsets in the input.
    """
    return [
        {
            "text": span.text,
            "label": span.label_,
            "start": span.start_char,
            "end": span.end_char,
        }
        for span in nlp(text).ents
    ]
|
||||
# Maps a spaCy entity label to the plural property name under which entities
# of that label are grouped in the "data" object of /entities_by_type.
# NOTE(review): a name ENT_PROP_MAP is also imported from app.models in this
# module -- confirm whether this local copy is still needed.
ENT_PROP_MAP = {
    "CARDINAL": "cardinals",
    "DATE": "dates",
    "EVENT": "events",
    "FAC": "facilities",
    "GPE": "gpes",
    "LANGUAGE": "languages",
    "LAW": "laws",
    "LOC": "locations",
    "MONEY": "money",
    "NORP": "norps",
    "ORDINAL": "ordinals",
    "ORG": "organizations",
    "PERCENT": "percentages",
    "PERSON": "people",
    "PRODUCT": "products",
    # Was misspelled "quanities", which leaked into the response keys.
    "QUANTITY": "quantities",
    "TIME": "times",
    "WORK_OF_ART": "worksOfArt",
}
|
||||
|
||||
|
||||
@app.get('/', include_in_schema=False)
|
||||
@app.get("/", include_in_schema=False)
|
||||
def docs_redirect():
|
||||
return RedirectResponse(f'{prefix}/docs')
|
||||
return RedirectResponse(f"{prefix}/docs")
|
||||
|
||||
|
||||
@app.post("/entities", response_model=RecordsResponse, tags=["NER"])
async def extract_entities(body: RecordsRequest):
    """Extract Named Entities from a batch of Records.

    Every record's ``recordId`` and ``data.text`` are handed to the shared
    ``extractor`` as a single batch. The response echoes each ``recordId``
    together with the entities found in that record's text.
    """
    documents = [
        {"id": val.recordId, "text": val.data.text} for val in body.values
    ]

    # No debug print here: the extraction result contains request-derived
    # text and should not be dumped to stdout on every call.
    entities_res = extractor.extract_entities(documents)

    res = [
        {"recordId": er["id"], "data": {"entities": er["entities"]}}
        for er in entities_res
    ]

    return {"values": res}
|
||||
|
||||
|
||||
@app.post(
    "/entities_by_type", response_model=RecordsEntitiesByTypeResponse, tags=["NER"]
)
async def extract_entities_by_type(body: RecordsRequest):
    """Extract Named Entities from a batch of Records separated by entity label.

    This route can be used directly as a Cognitive Skill in Azure Search
    For Documentation on integration with Azure Search, see here:
    https://docs.microsoft.com/en-us/azure/search/cognitive-search-custom-skill-interface"""

    documents = [
        {"id": val.recordId, "text": val.data.text} for val in body.values
    ]

    entities_res = extractor.extract_entities(documents)
    res = []

    for er in entities_res:
        # Group the entity names of one record under their mapped property.
        groupby = defaultdict(list)
        for ent in er["entities"]:
            # The loaded model is configurable, so it may emit labels that
            # ENT_PROP_MAP does not list. Fall back to the lowercased label
            # instead of failing the whole batch with a KeyError.
            ent_prop = ENT_PROP_MAP.get(ent["label"], ent["label"].lower())
            groupby[ent_prop].append(ent["name"])
        record = {"recordId": er["id"], "data": groupby}
        res.append(record)

    return {"values": res}
|
||||
|
|
|
@ -5,6 +5,28 @@ from typing import Dict, List, Optional
|
|||
from pydantic import BaseModel, Schema
|
||||
|
||||
|
||||
# Maps spaCy entity labels to plural, API-friendly property names.
ENT_PROP_MAP = {
    "CARDINAL": "cardinals",
    "DATE": "dates",
    "EVENT": "events",
    "FAC": "facilities",
    "GPE": "gpes",
    "LANGUAGE": "languages",
    "LAW": "laws",
    "LOC": "locations",
    "MONEY": "money",
    "NORP": "norps",
    "ORDINAL": "ordinals",
    "ORG": "organizations",
    "PERCENT": "percentages",
    "PERSON": "people",
    "PRODUCT": "products",
    # Was misspelled "quanities".
    "QUANTITY": "quantities",
    "TIME": "times",
    "WORK_OF_ART": "worksOfArt",
}
|
||||
|
||||
|
||||
class RecordDataRequest(BaseModel):
|
||||
text: str
|
||||
language: str = "en"
|
||||
|
@ -20,7 +42,7 @@ class RecordsRequest(BaseModel):
|
|||
|
||||
|
||||
class RecordDataResponse(BaseModel):
|
||||
entities: List[str]
|
||||
entities: List
|
||||
|
||||
|
||||
class Message(BaseModel):
|
||||
|
@ -36,3 +58,12 @@ class RecordResponse(BaseModel):
|
|||
|
||||
class RecordsResponse(BaseModel):
|
||||
values: List[RecordResponse]
|
||||
|
||||
|
||||
class RecordEntitiesByTypeResponse(BaseModel):
|
||||
recordId: str
|
||||
data: Dict[str, List[str]]
|
||||
|
||||
|
||||
class RecordsEntitiesByTypeResponse(BaseModel):
|
||||
values: List[RecordEntitiesByTypeResponse]
|
||||
|
|
|
@ -0,0 +1,74 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from typing import Dict, List
|
||||
import spacy
|
||||
from spacy.language import Language
|
||||
|
||||
|
||||
class SpacyExtractor:
    """Pipe records carrying an id and a text body through a spaCy model.

    For every record the entities found in its text are collected once per
    entity id, and each occurrence is kept as a character-offset match.
    """

    def __init__(
        self, nlp: Language, input_id_col: str = "id", input_text_col: str = "text"
    ):
        """Remember the model and the record keys to read from.

        nlp (spacy.language.Language): pre-loaded spacy language model
        input_id_col (str): property on each document to correlate with request
        input_text_col (str): property on each document to run the model on
        """
        self.nlp = nlp
        self.input_id_col = input_id_col
        self.input_text_col = input_text_col

    def _name_to_id(self, text: str):
        """Build a rough id from an entity name.

        text (str): text to create "id" from

        RETURNS (str): the whitespace-separated words of ``text``,
            lowercased and joined with hyphens.
        """
        lowered_words = [word.lower() for word in text.split()]
        return "-".join(lowered_words)

    def extract_entities(self, records: List[Dict[str, str]]):
        """Apply the pre-trained model to a batch of records.

        records (list): "document" dictionaries, each holding the configured
            id and text properties (``id`` and ``text`` by default)

        RETURNS (list): one dict per record with the record's ``id`` and an
            ``entities`` list. Each entity has a ``name``, a ``label`` and
            the ``matches`` (start, end, text) where it occurred.
        """
        id_key = self.input_id_col
        text_key = self.input_text_col

        # Both inputs stay lazy so the model can stream the batch via pipe().
        doc_ids = (record[id_key] for record in records)
        spacy_docs = self.nlp.pipe(record[text_key] for record in records)

        results = []
        for doc_id, spacy_doc in zip(doc_ids, spacy_docs):
            found = {}
            for span in spacy_doc.ents:
                # Prefer the knowledge-base id, then the entity id, and only
                # then fall back to an id derived from the surface text.
                key = span.kb_id or span.ent_id or self._name_to_id(span.text)

                entry = found.get(key)
                if entry is None:
                    surface = span.text
                    # All-lowercase mentions get a capitalised display name.
                    if surface.lower() == surface:
                        display_name = surface.capitalize()
                    else:
                        display_name = surface
                    entry = {
                        "name": display_name,
                        "label": span.label_,
                        "matches": [],
                    }
                    found[key] = entry

                entry["matches"].append(
                    {"start": span.start_char, "end": span.end_char, "text": span.text}
                )

            results.append({"id": doc_id, "entities": list(found.values())})

        return results
|
|
@ -1,2 +1,2 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# Licensed under the MIT License.
|
||||
|
|
|
@ -21,17 +21,8 @@ def test_api():
|
|||
software, which runs on its Echo and Dot devices, have clear leads in
|
||||
consumer adoption."""
|
||||
|
||||
|
||||
request_data = {
|
||||
"values": [
|
||||
{
|
||||
"recordId": "a1",
|
||||
"data": {
|
||||
"text": text,
|
||||
"language": "en"
|
||||
}
|
||||
}
|
||||
]
|
||||
"values": [{"recordId": "a1", "data": {"text": text, "language": "en"}}]
|
||||
}
|
||||
|
||||
response = client.post("/spacy_entities", json=request_data)
|
||||
|
@ -49,5 +40,5 @@ def test_api():
|
|||
"Echo and Dot",
|
||||
"Google",
|
||||
"iPhones",
|
||||
"Siri"
|
||||
"Siri",
|
||||
]
|
||||
|
|
|
@ -1,22 +1,111 @@
|
|||
# Deploy to Azure Kubernetes Service
|
||||
# Build and push image to Azure Container Registry; Deploy to Azure Kubernetes Service
|
||||
# https://docs.microsoft.com/azure/devops/pipelines/languages/docker
|
||||
|
||||
trigger:
|
||||
batch: true
|
||||
branches:
|
||||
include:
|
||||
- master
|
||||
- master
|
||||
|
||||
pool:
|
||||
vmImage: 'ubuntu-16.04'
|
||||
resources:
|
||||
- repo: self
|
||||
|
||||
steps:
|
||||
- script: |
|
||||
docker build . -t {{cookiecutter.project_slug}}
|
||||
displayName: Build Docker Container
|
||||
- script: |
|
||||
DOCKER_IMAGE=$(docker images -q | sed 1q)
|
||||
docker run -i "$DOCKER_IMAGE" /bin/bash -c "black ./app --check"
|
||||
displayName: Lint with Black
|
||||
continueOnError: true
|
||||
- script: |
|
||||
DOCKER_IMAGE=$(docker images -q | sed 1q)
|
||||
docker run -i -v junit:/usr/src "$DOCKER_IMAGE" /bin/bash -c "pytest app/tests --doctest-modules --junitxml=junit/test-results.xml --cov --cov-report=xml --cov-report=html"
|
||||
displayName: Test with pytest
|
||||
variables:
|
||||
|
||||
# Container registry service connection established during pipeline creation
|
||||
|
||||
# See: https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#sep-docreg
|
||||
dockerRegistryServiceConnection: 'yourDockerRegistryServiceConnectionGUID'
|
||||
imageRepository: 'yourImageRepositoryName'
|
||||
containerRegistry: 'yourContainerRegistryName.azurecr.io'
|
||||
dockerfilePath: '**/Dockerfile'
|
||||
tag: '$(Build.BuildId)'
|
||||
|
||||
# Kubernetes Namespace
|
||||
k8sNamespace: 'default'
|
||||
imagePullSecret: 'yourImagePullSecretName-auth'
|
||||
|
||||
# See: https://docs.microsoft.com/en-us/azure/devops/pipelines/process/environments-kubernetes?view=azure-devops
|
||||
envName: 'yourEnvName-cluster.default'
|
||||
|
||||
# Agent VM image name
|
||||
vmImageName: 'ubuntu-latest'
|
||||
|
||||
stages:
|
||||
- stage: Build
|
||||
displayName: Build stage
|
||||
jobs:
|
||||
- job: Build
|
||||
displayName: Build job
|
||||
pool:
|
||||
vmImage: $(vmImageName)
|
||||
steps:
|
||||
- task: Docker@2
|
||||
displayName: Build image
|
||||
inputs:
|
||||
command: build
|
||||
repository: $(imageRepository)
|
||||
dockerfile: $(dockerfilePath)
|
||||
containerRegistry: $(dockerRegistryServiceConnection)
|
||||
tags: |
|
||||
$(tag)
|
||||
- script: |
|
||||
DOCKER_IMAGE=$(docker images -q | sed 1q)
|
||||
docker run -i "$DOCKER_IMAGE" /bin/bash -c "black ./app --check"
|
||||
displayName: Run black check
|
||||
- script: |
|
||||
DOCKER_IMAGE=$(docker images -q | sed 1q)
|
||||
docker run -i -v junit:/app/junit "$DOCKER_IMAGE" /bin/bash -c "pytest app/tests --doctest-modules --junitxml=junit/test-results.xml --cov --cov-report=xml --cov-report=html"
|
||||
displayName: Test with pytest
|
||||
- task: PublishTestResults@2
|
||||
inputs:
|
||||
testResultsFiles: 'junit/test-results.xml'
|
||||
testRunTitle: 'Publish test results'
|
||||
- task: Docker@2
|
||||
displayName: Push image to container registry
|
||||
inputs:
|
||||
command: buildAndPush
|
||||
repository: $(imageRepository)
|
||||
dockerfile: $(dockerfilePath)
|
||||
containerRegistry: $(dockerRegistryServiceConnection)
|
||||
tags: |
|
||||
$(tag)
|
||||
- task: PublishPipelineArtifact@0
|
||||
inputs:
|
||||
artifactName: 'manifests'
|
||||
targetPath: 'manifests'
|
||||
|
||||
- stage: Deploy_BVT
|
||||
displayName: Deploy BVT
|
||||
dependsOn: Build
|
||||
jobs:
|
||||
- deployment: Deploy_BVT
|
||||
pool:
|
||||
vmImage: $(vmImageName)
|
||||
environment: '$(envName)'
|
||||
strategy:
|
||||
runOnce:
|
||||
deploy:
|
||||
steps:
|
||||
- task: DownloadPipelineArtifact@1
|
||||
inputs:
|
||||
artifactName: 'manifests'
|
||||
downloadPath: '$(System.ArtifactsDirectory)/manifests'
|
||||
|
||||
- task: KubernetesManifest@0
|
||||
displayName: Create imagePullSecret
|
||||
inputs:
|
||||
action: createSecret
|
||||
secretName: $(imagePullSecret)
|
||||
namespace: $(k8sNamespace)
|
||||
dockerRegistryEndpoint: $(dockerRegistryServiceConnection)
|
||||
- task: KubernetesManifest@0
|
||||
displayName: Deploy to Kubernetes cluster
|
||||
inputs:
|
||||
action: deploy
|
||||
namespace: $(k8sNamespace)
|
||||
manifests: |
|
||||
$(System.ArtifactsDirectory)/manifests/deployment.yml
|
||||
$(System.ArtifactsDirectory)/manifests/service.yml
|
||||
imagePullSecrets: |
|
||||
$(imagePullSecret)
|
||||
containers: |
|
||||
$(containerRegistry)/$(imageRepository):$(tag)
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
import uvicorn
|
||||
from app.api import app
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
uvicorn.run(app, host='0.0.0.0', port=8080, log_level='info')
|
|
@ -0,0 +1,33 @@
|
|||
# This Deployment Manifest is not parameterized so you'll need to update the app label,
|
||||
# container image and name in order for your azure pipelines deployment to succeed
|
||||
---
# apps/v1beta1 Deployments are no longer served as of Kubernetes 1.16;
# apps/v1 is the stable API and requires an explicit spec.selector.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: spacy-extractor
spec:
  replicas: 3
  # Must match the pod template labels below.
  selector:
    matchLabels:
      app: spacy-extractor
  strategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: spacy-extractor
    spec:
      containers:
        - name: spacy-extractor
          image: yourContainerRegistryName.azurecr.io/spacyextractor
          imagePullPolicy: Always
          resources:
            requests:
              memory: 0.1G
              cpu: 0.1
            limits:
              memory: 0.5G
              cpu: 0.5
          ports:
            - name: http-api
              containerPort: 8080
          env:
            # Route prefix the API is served under behind the gateway.
            - name: CLUSTER_ROUTE_PREFIX
              value: /spacy/
|
|
@ -0,0 +1,28 @@
|
|||
# This service assumes you're using Ambassador as an API Gateway.
|
||||
# See https://getambassador.io for more info on setting up Ambassador with Kubernetes
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: spacy-extractor
|
||||
annotations:
|
||||
getambassador.io/config: |
|
||||
---
|
||||
apiVersion: ambassador/v1
|
||||
kind: Mapping
|
||||
name: spacy-extractor-mapping
|
||||
prefix: /spacy/
|
||||
service: spacy-extractor
|
||||
timeout_ms: 20000
|
||||
bypass_auth: true
|
||||
spec:
|
||||
selector:
|
||||
app: spacy-extractor
|
||||
ports:
|
||||
- port: 80
|
||||
targetPort: http-api
|
||||
name: http
|
||||
- port: 443
|
||||
targetPort: http-api
|
||||
name: https
|
||||
type: ClusterIP
|
|
@ -1,7 +1,8 @@
|
|||
fastapi
|
||||
requests
|
||||
uvicorn
|
||||
python-dotenv
|
||||
pytest
|
||||
pytest-cov
|
||||
black
|
||||
spacy==2.2.0
|
||||
fastapi==0.42.0
|
||||
requests==2.22.0
|
||||
uvicorn==0.9.1
|
||||
python-dotenv==0.10.3
|
||||
pytest==5.2.1
|
||||
pytest-cov==2.8.1
|
||||
black==19.3b0
|
|
@ -1,2 +0,0 @@
|
|||
spacy
|
||||
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz
|
|
@ -1,2 +0,0 @@
|
|||
#!/bin/bash
|
||||
uvicorn app.api:app --host 0.0.0.0 --port ${PORT}
|
Загрузка…
Ссылка в новой задаче