Update deployment to target an Azure DevOps Kubernetes environment. Add batched input processing that uses spaCy's nlp.pipe.

2019-10-21 13:53:31 -07:00 · 2019-10-21 13:53:31 -07:00 · 40e59912fd
--- a/cookiecutter.json
+++ b/cookiecutter.json
@ -1,5 +1,6 @@
 {
    "project_name": "Python Azure Search Cognitive Skill API",
    "project_slug": "{{ cookiecutter.project_name.lower().replace(' ', '_').replace('-', '_') }}",
-    "project_short_description": "Python API that for Custom Cognitive Skills in Azure Search"
+    "project_short_description": "Python API that for Custom Cognitive Skills in Azure Search",
+    "project_language": "This must be one of spaCy's default languages. See https://spacy.io/usage for a supported list."
 }
--- a/{{cookiecutter.project_slug}}/Dockerfile
+++ b/{{cookiecutter.project_slug}}/Dockerfile
@ -1,15 +1,17 @@
-FROM python:3.6
+FROM tiangolo/uvicorn-gunicorn-fastapi:python3.7
 ENV PORT 8080
-EXPOSE ${PORT}
-WORKDIR /usr/src
+ENV APP_MODULE app.api:app
+ENV LOG_LEVEL debug
+ENV WEB_CONCURRENCY 2

+# Install spacy requirements separately first so that Docker will
+# cache the (somewhat) expensive download of a spacy model
 COPY ./requirements/spacy.txt ./requirements/spacy.txt
 RUN pip install -r requirements/spacy.txt
+RUN spacy download {{cookiecutter.project_language}}

 COPY ./requirements/base.txt ./requirements/base.txt
 RUN pip install -r requirements/base.txt
-COPY ./app ./app

-COPY start.sh .
-
-CMD ["bash", "start.sh"]
+COPY .env /app/.env
+COPY ./app /app/app
--- a/{{cookiecutter.project_slug}}/README.md
+++ b/{{cookiecutter.project_slug}}/README.md
@ -22,7 +22,7 @@ To run locally in debug mode run:

 ```
 cd ./{{cookiecutter.project_slug}}
-uvicorn app.api:app --debug
+uvicorn app.api:app --reload
 ```
 Open your browser to http://localhost:8000/docs to view the SwaggerUI.

--- a/{{cookiecutter.project_slug}}/app/init.py
+++ b/{{cookiecutter.project_slug}}/app/init.py
@ -1,2 +1,2 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
+# Licensed under the MIT License.
--- a/{{cookiecutter.project_slug}}/app/api.py
+++ b/{{cookiecutter.project_slug}}/app/api.py
@ -1,6 +1,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.

+from collections import defaultdict
 import os

 from dotenv import load_dotenv, find_dotenv
@ -10,11 +11,17 @@ from starlette.responses import RedirectResponse
 import spacy
 import uvicorn

-from app.models import RecordsRequest, RecordsResponse
+from app.models import (
+    ENT_PROP_MAP,
+    RecordsRequest,
+    RecordsResponse,
+    RecordsEntitiesByTypeResponse,
+)
+from app.spacy_extractor import SpacyExtractor


 load_dotenv(find_dotenv())
-prefix = os.environ.get("CLUSTER_ROUTE_PREFIX")
+prefix = os.getenv("CLUSTER_ROUTE_PREFIX")
 if not prefix:
    prefix = ""
 prefix = prefix.rstrip("/")
@ -24,54 +31,85 @@ app = FastAPI(
    title="{{cookiecutter.project_name}}",
    version="1.0",
    description="{{cookiecutter.project_short_description}}",
-    openapi_prefix=prefix
+    openapi_prefix=prefix,
 )

-nlp = spacy.load('en_core_web_sm')
+nlp = spacy.load("{{cookiecutter.project_language}}")
+extractor = SpacyExtractor(nlp)


-def extract_from_text(text: str):
-    """Extract Spacy Named Entities from raw text"""
-    entities = []
-    for ent in nlp(text).ents:
-        match = {
-            "text": ent.text,
-            "label": ent.label_,
-            "start": ent.start_char,
-            "end": ent.end_char,
-        }
-        entities.append(match)
-    return entities
+ENT_PROP_MAP = {
+    "CARDINAL": "cardinals",
+    "DATE": "dates",
+    "EVENT": "events",
+    "FAC": "facilities",
+    "GPE": "gpes",
+    "LANGUAGE": "languages",
+    "LAW": "laws",
+    "LOC": "locations",
+    "MONEY": "money",
+    "NORP": "norps",
+    "ORDINAL": "ordinals",
+    "ORG": "organizations",
+    "PERCENT": "percentages",
+    "PERSON": "people",
+    "PRODUCT": "products",
+    "QUANTITY": "quanities",
+    "TIME": "times",
+    "WORK_OF_ART": "worksOfArt",
+}


-@app.get('/', include_in_schema=False)
+@app.get("/", include_in_schema=False)
 def docs_redirect():
-    return RedirectResponse(f'{prefix}/docs')
+    return RedirectResponse(f"{prefix}/docs")
+
+
+@app.post("/entities", response_model=RecordsResponse, tags=["NER"])
+async def extract_entities(body: RecordsRequest):
+    """Extract Named Entities from a batch of Records."""
+
+    res = []
+    documents = []
+
+    for val in body.values:
+        documents.append({"id": val.recordId, "text": val.data.text})
+
+    entities_res = extractor.extract_entities(documents)
+    print(entities_res)
+
+    res = [
+        {"recordId": er["id"], "data": {"entities": er["entities"]}}
+        for er in entities_res
+    ]
+
+    return {"values": res}


@app.post(
-    "/spacy_entities",
-    response_model=RecordsResponse,
-    tags=["NER", "Azure Search"],
+    "/entities_by_type", response_model=RecordsEntitiesByTypeResponse, tags=["NER"]
 )
-async def extract_entities(body: RecordsRequest):
-    """Extract Named Entities from a batch of Records.
+async def extract_entities_by_type(body: RecordsRequest):
+    """Extract Named Entities from a batch of Records separated by entity label.
        This route can be used directly as a Cognitive Skill in Azure Search
        For Documentation on integration with Azure Search, see here:
        https://docs.microsoft.com/en-us/azure/search/cognitive-search-custom-skill-interface"""

    res = []
+    documents = []
+
    for val in body.values:
-        ents = set([e["text"] for e in extract_from_text(val.data.text)])
-        ents = sorted(list(ents), key=lambda s: s.lower())
-        res.append(
-            {
-                "recordId": val.recordId,
-                "data": {
-                    "entities": ents
-                }
-            }
-        )
-    return {
-        "values": res
-    }
+        documents.append({"id": val.recordId, "text": val.data.text})
+
+    entities_res = extractor.extract_entities(documents)
+    res = []
+
+    for er in entities_res:
+        groupby = defaultdict(list)
+        for ent in er["entities"]:
+            ent_prop = ENT_PROP_MAP[ent["label"]]
+            groupby[ent_prop].append(ent["name"])
+        record = {"recordId": er["id"], "data": groupby}
+        res.append(record)
+
+    return {"values": res}
--- a/{{cookiecutter.project_slug}}/app/models.py
+++ b/{{cookiecutter.project_slug}}/app/models.py
@ -5,6 +5,28 @@ from typing import Dict, List, Optional
 from pydantic import BaseModel, Schema


+ENT_PROP_MAP = {
+    "CARDINAL": "cardinals",
+    "DATE": "dates",
+    "EVENT": "events",
+    "FAC": "facilities",
+    "GPE": "gpes",
+    "LANGUAGE": "languages",
+    "LAW": "laws",
+    "LOC": "locations",
+    "MONEY": "money",
+    "NORP": "norps",
+    "ORDINAL": "ordinals",
+    "ORG": "organizations",
+    "PERCENT": "percentages",
+    "PERSON": "people",
+    "PRODUCT": "products",
+    "QUANTITY": "quanities",
+    "TIME": "times",
+    "WORK_OF_ART": "worksOfArt",
+}
+
+
 class RecordDataRequest(BaseModel):
    text: str
    language: str = "en"
@ -20,7 +42,7 @@ class RecordsRequest(BaseModel):


 class RecordDataResponse(BaseModel):
-    entities: List[str]
+    entities: List


 class Message(BaseModel):
@ -36,3 +58,12 @@ class RecordResponse(BaseModel):

 class RecordsResponse(BaseModel):
    values: List[RecordResponse]
+
+
+class RecordEntitiesByTypeResponse(BaseModel):
+    recordId: str
+    data: Dict[str, List[str]]
+
+
+class RecordsEntitiesByTypeResponse(BaseModel):
+    values: List[RecordEntitiesByTypeResponse]
--- a/{{cookiecutter.project_slug}}/app/spacy_extractor.py
+++ b/{{cookiecutter.project_slug}}/app/spacy_extractor.py
@ -0,0 +1,74 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+from typing import Dict, List
+import spacy
+from spacy.language import Language
+
+
+class SpacyExtractor:
+    """SpacyExtractor encapsulates the logic to pipe records, each with an id and a
+    text body, through a spaCy model and return the entities found in each record.
+    """
+
+    def __init__(
+        self, nlp: Language, input_id_col: str = "id", input_text_col: str = "text"
+    ):
+        """Initialize the SpacyExtractor pipeline.
+        
+        nlp (spacy.language.Language): pre-loaded spacy language model
+        input_text_col (str): property on each document to run the model on
+        input_id_col (str): property on each document to correlate with request
+
+        RETURNS (SpacyExtractor): The newly constructed object.
+        """
+        self.nlp = nlp
+        self.input_id_col = input_id_col
+        self.input_text_col = input_text_col
+
+    def _name_to_id(self, text: str):
+        """Utility function to roughly normalize an entity name: lowercase its whitespace-separated tokens and join them with hyphens
+
+        text (str): text to create "id" from
+        """
+        return "-".join([s.lower() for s in text.split()])
+
+    def extract_entities(self, records: List[Dict[str, str]]):
+        """Apply the pre-trained model to a batch of records
+        
+        records (list): The list of "document" dictionaries each with an
+            `id` and `text` property
+        
+        RETURNS (list): List of responses containing the id of 
+            the correlating document and a list of entities.
+        """
+        ids = (doc[self.input_id_col] for doc in records)
+        texts = (doc[self.input_text_col] for doc in records)
+
+        res = []
+
+        for doc_id, spacy_doc in zip(ids, self.nlp.pipe(texts)):
+            entities = {}
+            for ent in spacy_doc.ents:
+                ent_id = ent.kb_id
+                if not ent_id:
+                    ent_id = ent.ent_id
+                if not ent_id:
+                    ent_id = self._name_to_id(ent.text)
+
+                if ent_id not in entities:
+                    if ent.text.lower() == ent.text:
+                        ent_name = ent.text.capitalize()
+                    else:
+                        ent_name = ent.text
+                    entities[ent_id] = {
+                        "name": ent_name,
+                        "label": ent.label_,
+                        "matches": [],
+                    }
+                entities[ent_id]["matches"].append(
+                    {"start": ent.start_char, "end": ent.end_char, "text": ent.text}
+                )
+
+            res.append({"id": doc_id, "entities": list(entities.values())})
+        return res
--- a/{{cookiecutter.project_slug}}/app/tests/init.py
+++ b/{{cookiecutter.project_slug}}/app/tests/init.py
@ -1,2 +1,2 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
+# Licensed under the MIT License.
--- a/{{cookiecutter.project_slug}}/app/tests/test_api.py
+++ b/{{cookiecutter.project_slug}}/app/tests/test_api.py
@ -21,17 +21,8 @@ def test_api():
    software, which runs on its Echo and Dot devices, have clear leads in
    consumer adoption."""

-
    request_data = {
-        "values": [
-            {
-                "recordId": "a1",
-                "data": {
-                    "text": text,
-                    "language": "en"
-                }
-            }
-        ]
+        "values": [{"recordId": "a1", "data": {"text": text, "language": "en"}}]
    }

    response = client.post("/spacy_entities", json=request_data)
@ -49,5 +40,5 @@ def test_api():
        "Echo and Dot",
        "Google",
        "iPhones",
-        "Siri"
+        "Siri",
    ]
--- a/{{cookiecutter.project_slug}}/azure-pipelines.yml
+++ b/{{cookiecutter.project_slug}}/azure-pipelines.yml
@ -1,22 +1,111 @@
+# Deploy to Azure Kubernetes Service
+# Build and push image to Azure Container Registry; Deploy to Azure Kubernetes Service
+# https://docs.microsoft.com/azure/devops/pipelines/languages/docker
+
 trigger:
-  batch: true
-  branches:
-    include: 
-      - master
+- master

-pool:
-  vmImage: 'ubuntu-16.04'
+resources:
+- repo: self

-steps:
- script: |
-    docker build . -t {{cookiecutter.project_slug}}
-  displayName: Build Docker Container
- script: |
-    DOCKER_IMAGE=$(docker images -q | sed 1q)
-    docker run -i "$DOCKER_IMAGE" /bin/bash -c "black ./app --check"
-  displayName: Lint with Black
-  continueOnError: true
- script: |
-    DOCKER_IMAGE=$(docker images -q | sed 1q)
-    docker run -i -v junit:/usr/src "$DOCKER_IMAGE" /bin/bash -c "pytest app/tests --doctest-modules --junitxml=junit/test-results.xml --cov --cov-report=xml --cov-report=html"
-  displayName: Test with pytest
+variables:
+
+  # Container registry service connection established during pipeline creation
+  
+  # See: https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#sep-docreg
+  dockerRegistryServiceConnection: 'yourDockerRegistryServiceConnectionGUID' 
+  imageRepository: 'yourImageRepositoryName'
+  containerRegistry: 'yourContainerRegistryName.azurecr.io'
+  dockerfilePath: '**/Dockerfile'
+  tag: '$(Build.BuildId)'
+  
+  # Kubernetes Namespace
+  k8sNamespace: 'default'
+  imagePullSecret: 'yourImagePullSecretName-auth'
+
+  # See: https://docs.microsoft.com/en-us/azure/devops/pipelines/process/environments-kubernetes?view=azure-devops
+  envName: 'yourEnvName-cluster.default'
+
+  # Agent VM image name
+  vmImageName: 'ubuntu-latest'
+
+stages:
+- stage: Build
+  displayName: Build stage
+  jobs:  
+  - job: Build
+    displayName: Build job
+    pool:
+      vmImage: $(vmImageName)
+    steps:
+    - task: Docker@2
+      displayName: Build image
+      inputs:
+        command: build
+        repository: $(imageRepository)
+        dockerfile: $(dockerfilePath)
+        containerRegistry: $(dockerRegistryServiceConnection)
+        tags: |
+          $(tag)
+    - script: |
+        DOCKER_IMAGE=$(docker images -q | sed 1q)
+        docker run -i "$DOCKER_IMAGE" /bin/bash -c "black ./app --check"
+      displayName: Run black check
+    - script: |
+        DOCKER_IMAGE=$(docker images -q | sed 1q)
+        docker run -i -v junit:/app/junit "$DOCKER_IMAGE" /bin/bash -c "pytest app/tests --doctest-modules --junitxml=junit/test-results.xml --cov --cov-report=xml --cov-report=html"
+      displayName: Test with pytest
+    - task: PublishTestResults@2
+      inputs:
+        testResultsFiles: 'junit/test-results.xml'
+        testRunTitle: 'Publish test results'
+    - task: Docker@2
+      displayName: Push image to container registry
+      inputs:
+        command: buildAndPush
+        repository: $(imageRepository)
+        dockerfile: $(dockerfilePath)
+        containerRegistry: $(dockerRegistryServiceConnection)
+        tags: |
+          $(tag)
+    - task: PublishPipelineArtifact@0
+      inputs:
+        artifactName: 'manifests'
+        targetPath: 'manifests'
+
+- stage: Deploy_BVT
+  displayName: Deploy BVT
+  dependsOn: Build
+  jobs:
+  - deployment: Deploy_BVT
+    pool:
+      vmImage: $(vmImageName)
+    environment: '$(envName)'
+    strategy:
+      runOnce:
+        deploy:
+          steps:
+          - task: DownloadPipelineArtifact@1
+            inputs:
+              artifactName: 'manifests'
+              downloadPath: '$(System.ArtifactsDirectory)/manifests'
+
+          - task: KubernetesManifest@0
+            displayName: Create imagePullSecret
+            inputs:
+              action: createSecret
+              secretName: $(imagePullSecret)
+              namespace: $(k8sNamespace)
+              dockerRegistryEndpoint: $(dockerRegistryServiceConnection)
+          - task: KubernetesManifest@0
+            displayName: Deploy to Kubernetes cluster
+            inputs:
+              action: deploy
+              namespace: $(k8sNamespace)
+              manifests: |
+                $(System.ArtifactsDirectory)/manifests/deployment.yml
+                $(System.ArtifactsDirectory)/manifests/service.yml
+              imagePullSecrets: |
+                $(imagePullSecret)
+              containers: |
+                $(containerRegistry)/$(imageRepository):$(tag)
--- a/{{cookiecutter.project_slug}}/main.py
+++ b/{{cookiecutter.project_slug}}/main.py
@ -0,0 +1,6 @@
+import uvicorn
+from app.api import app
+
+
+if __name__ == '__main__':
+    uvicorn.run(app, host='0.0.0.0', port=8080, log_level='info')
--- a/{{cookiecutter.project_slug}}/manifests/deployment.yml
+++ b/{{cookiecutter.project_slug}}/manifests/deployment.yml
@ -0,0 +1,33 @@
+# This Deployment manifest is not parameterized, so you'll need to update the app label,
+# container image, and name in order for your Azure Pipelines deployment to succeed.
+---
+apiVersion: apps/v1beta1
+kind: Deployment
+metadata:
+  name: spacy-extractor
+spec:
+  replicas: 3
+  strategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app: spacy-extractor
+    spec:
+      containers:
+      - name: spacy-extractor
+        image: yourContainerRegistryName.azurecr.io/spacyextractor
+        imagePullPolicy: Always
+        resources:
+          requests:
+            memory: 0.1G
+            cpu: 0.1
+          limits:
+            memory: 0.5G
+            cpu: 0.5
+        ports:
+        - name: http-api
+          containerPort: 8080
+        env:
+        - name: CLUSTER_ROUTE_PREFIX
+          value: /spacy/
--- a/{{cookiecutter.project_slug}}/manifests/service.yml
+++ b/{{cookiecutter.project_slug}}/manifests/service.yml
@ -0,0 +1,28 @@
+# This service assumes you're using Ambassador as an API Gateway.
+# See https://getambassador.io for more info on setting up Ambassador with Kubernetes
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: spacy-extractor
+  annotations:
+    getambassador.io/config: |
+      ---
+      apiVersion: ambassador/v1
+      kind: Mapping
+      name: spacy-extractor-mapping
+      prefix: /spacy/
+      service: spacy-extractor
+      timeout_ms: 20000
+      bypass_auth: true
+spec:
+  selector:
+    app: spacy-extractor
+  ports:
+    - port: 80
+      targetPort: http-api
+      name: http
+    - port: 443
+      targetPort: http-api
+      name: https
+  type: ClusterIP
--- a/{{cookiecutter.project_slug}}/requirements/base.txt
+++ b/{{cookiecutter.project_slug}}/requirements/base.txt
@ -1,7 +1,8 @@
-fastapi
-requests
-uvicorn
-python-dotenv
-pytest
-pytest-cov
-black
+spacy==2.2.0
+fastapi==0.42.0
+requests==2.22.0
+uvicorn==0.9.1
+python-dotenv==0.10.3
+pytest==5.2.1
+pytest-cov==2.8.1
+black==19.3b0
--- a/{{cookiecutter.project_slug}}/requirements/spacy.txt
+++ b/{{cookiecutter.project_slug}}/requirements/spacy.txt
@ -1,2 +0,0 @@
-spacy
-https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz
--- a/{{cookiecutter.project_slug}}/start.sh
+++ b/{{cookiecutter.project_slug}}/start.sh
@ -1,2 +0,0 @@
-#!/bin/bash
-uvicorn app.api:app --host 0.0.0.0 --port ${PORT}