This commit is contained in:
Kabir Khan 2019-04-05 10:30:59 -07:00
Коммит 3cae5bb603
26 изменённых файлов: 571 добавлений и 0 удалений

1
.funcignore Normal file
Просмотреть файл

@ -0,0 +1 @@
.env

44
.gitignore поставляемый Normal file
Просмотреть файл

@ -0,0 +1,44 @@
bin
obj
csx
.vs
edge
Publish
*.user
*.suo
*.cscfg
*.Cache
project.lock.json
/packages
/TestResults
/tools/NuGet.exe
/App_Data
/secrets
/data
.secrets
appsettings.json
local.settings.json
node_modules
dist
# Local python packages
.python_packages/
# Python Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
functionapp_skills.zip

6
.vscode/extensions.json поставляемый Normal file
Просмотреть файл

@ -0,0 +1,6 @@
{
"recommendations": [
"ms-azuretools.vscode-azurefunctions",
"ms-python.python"
]
}

12
.vscode/launch.json поставляемый Normal file
Просмотреть файл

@ -0,0 +1,12 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Attach to Python Functions",
"type": "python",
"request": "attach",
"port": 9091,
"preLaunchTask": "func: host start"
}
]
}

13
.vscode/settings.json поставляемый Normal file
Просмотреть файл

@ -0,0 +1,13 @@
{
"azureFunctions.projectRuntime": "~2",
"azureFunctions.projectLanguage": "Python",
"azureFunctions.deploySubpath": "functionapp_skills.zip",
"azureFunctions.preDeployTask": "func: pack --build-native-deps",
"files.exclude": {
"obj": true,
"bin": true
},
"azureFunctions.pythonVenv": ".env",
"debug.internalConsoleOptions": "neverOpen",
"python.pythonPath": ".env\\Scripts\\python.exe"
}

32
.vscode/tasks.json поставляемый Normal file
Просмотреть файл

@ -0,0 +1,32 @@
{
"version": "2.0.0",
"tasks": [
{
"type": "func",
"command": "host start",
"problemMatcher": "$func-watch",
"isBackground": true,
"dependsOn": "func: extensions install"
},
{
"type": "func",
"command": "extensions install",
"dependsOn": "pipInstall",
"problemMatcher": []
},
{
"label": "pipInstall",
"type": "shell",
"osx": {
"command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
},
"windows": {
"command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt"
},
"linux": {
"command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
},
"problemMatcher": []
}
]
}

6
Dockerfile Normal file
Просмотреть файл

@ -0,0 +1,6 @@
FROM mcr.microsoft.com/azure-functions/python:2.0
COPY . /home/site/wwwroot
RUN cd /home/site/wwwroot && \
pip install -r requirements.txt

5
README.md Normal file
Просмотреть файл

@ -0,0 +1,5 @@
# Azure Functions Skills Extractor
## Follow instructions here
https://docs.microsoft.com/en-us/azure/azure-functions/functions-create-function-linux-custom-image#create-a-resource-group

0
_data/.gitkeep Normal file
Просмотреть файл

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,14 @@
{
"documents":[
{
"id": "a1",
"text": "Machine learning (ML) is a field of computer science that uses statistical techniques to give computer systems the ability to \"learn\" Evolved from the study of pattern recognition and computational learning theory in artificial intelligence, machine learning explores the study and construction of algorithms that can learn from and make predictions on data",
"language": "en"
},
{
"id": "a2",
"text": "Some skills text like pattern recognition and algorithms and nlp",
"language": "en"
}
]
}

1
_data/skills.json Normal file

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

1
_data/sources.json Normal file
Просмотреть файл

@ -0,0 +1 @@
["Stackshare Skill", "Github Topics", "Stackshare", "Microsoft Academic Graph", "Stackshare Skills"]

24
azure-pipelines.yml Normal file
Просмотреть файл

@ -0,0 +1,24 @@
pool:
vmImage: 'Ubuntu 16.04'
steps:
- task: NodeTool@0
inputs:
versionSpec: '8.x'
- script: |
set -e
echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ wheezy main" | sudo tee /etc/apt/sources.list.d/azure-cli.list
curl -L https://packages.microsoft.com/keys/microsoft.asc | sudo apt-key add -
sudo apt-get install -y apt-transport-https
echo "install Azure CLI..."
sudo apt-get update && sudo apt-get install -y azure-cli
npm i -g azure-functions-core-tools --unsafe-perm true
echo "installing dotnet core"
curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin --channel 2.0
- script: |
set -e
az login --service-principal --username "$(APP_ID)" --password "$(PASSWORD)" --tenant "$(TENANT_ID)"
func settings add FUNCTIONS_WORKER_RUNTIME python
func extensions install
func azure functionapp publish $(APP_NAME) --build-native-deps

Просмотреть файл

@ -0,0 +1,73 @@
import logging
import json
import asyncio
import azure.functions as func
import spacy
from spacy.lang.en import English
from .models import *
from ..services.skills import SkillsExtractor
nlp = English()
skills_extractor = SkillsExtractor(nlp)
async def extract_from_text(text: str):
    """Run the shared SkillsExtractor over *text*.

    Returns a list of ``{"id": ..., "standardizedName": ...}`` dicts,
    one per distinct skill found in the text.
    """
    found = skills_extractor.extract_skills(text)
    return [
        {"id": sid, "standardizedName": info["name"]}
        for sid, info in found.items()
    ]
async def extract_from_doc(doc, skill_property='id'):
    """Extract Skills from a single Document.

    *doc* is one Azure Search record; *skill_property* selects which field
    of each extracted skill is returned ("id" or "standardizedName").
    """
    extracted = await extract_from_text(doc.data.text)
    skill_values = [skill[skill_property] for skill in extracted]
    return {
        "recordId": doc.recordId,
        "data": {
            "skills": skill_values
        },
        "warnings": None,
        "errors": None
    }
async def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP entry point for the Azure Cognitive Search custom skill.

    Parses the request body into an AzureSearchDocumentsRequest, extracts
    skills from every document concurrently, and returns one result record
    per input document.

    Query params:
        skill_property: which field of each skill to return per document
            ("id" by default, or "standardizedName").

    Returns:
        200 with a JSON ``{"values": [...]}`` body on success,
        400 when the body is missing or fails validation.
    """
    logging.info('Python HTTP trigger function processed a request.')
    skill_property = req.params.get('skill_property', 'id')
    try:
        body = AzureSearchDocumentsRequest(**req.get_json())
        logging.info(body)
    except ValueError:
        # Covers both malformed JSON and model validation failures
        # (pydantic's ValidationError subclasses ValueError).
        return func.HttpResponse(
            "Please pass a valid request body",
            status_code=400
        )
    # BUG FIX: the original only returned inside `if body:` and fell off the
    # end (implicit None) otherwise, which the Functions host reports as an
    # error. Guard explicitly so every path returns an HttpResponse.
    if not body:
        return func.HttpResponse(
            "Please pass a valid request body",
            status_code=400
        )
    # Create one coroutine per document, then await them all concurrently.
    tasks = [
        extract_from_doc(doc, skill_property=skill_property)
        for doc in body.values
    ]
    results = await asyncio.gather(*tasks)
    return func.HttpResponse(
        json.dumps({'values': results}),
        headers={'Content-Type': 'application/json'}
    )

Просмотреть файл

@ -0,0 +1,20 @@
{
"scriptFile": "__init__.py",
"bindings": [
{
"authLevel": "function",
"type": "httpTrigger",
"direction": "in",
"name": "req",
"route": "azure_cognitive_search",
"methods": [
"post"
]
},
{
"type": "http",
"direction": "out",
"name": "$return"
}
]
}

Просмотреть файл

@ -0,0 +1,37 @@
from enum import Enum
from typing import List, Optional
from pydantic import BaseModel, Schema
class SkillProperty(str, Enum):
    """Which field of an extracted skill a caller may request."""
    id = "id"
    name = "standardizedName"


# Azure Search Cognitive Skills Models
class AzureSearchDocumentDataRequest(BaseModel):
    """Payload of one incoming document: raw text plus its language code."""
    text: str
    language: str = "en"


class AzureSearchDocumentRequest(BaseModel):
    """One record in an Azure Cognitive Search custom-skill request."""
    recordId: str
    data: AzureSearchDocumentDataRequest


class AzureSearchDocumentsRequest(BaseModel):
    """Top-level request body: the batch of records under "values"."""
    values: List[AzureSearchDocumentRequest]


class AzureSearchDocumentDataResponse(BaseModel):
    """Per-record response payload: the extracted skill strings."""
    skills: List[str]


class AzureSearchDocumentResponse(BaseModel):
    """One record in the custom-skill response; errors/warnings may be None."""
    recordId: str
    data: AzureSearchDocumentDataResponse
    errors: Optional[List[str]]
    warnings: Optional[List[str]]


class AzureSearchDocumentsResponse(BaseModel):
    """Top-level response body mirroring the request's "values" batch."""
    values: List[AzureSearchDocumentResponse]

Просмотреть файл

@ -0,0 +1,10 @@
{
"values": [
{
"recordId": "a1",
"data": {
"text": "Be part of the next revolution in computing! Join Microsofts Quantum Architecture and Computation team and work on the software that will be used to control quantum computers. You will join the team that is developing the software to acquire, store, secure, share, process and visualize data across the quantum program. You will work with physicists, hardware designers, software developers, and researchers to help make quantum computing a reality. Responsibilities This role is responsible for developing, deploying, maintaining and supporting several Azure services to serve the data needs of various facets of the quantum program. You will be part of a cross-functional distributed team with a broad range of skills. We work in a collaborative, agile, somewhat skunkworks fashion, and you will be expected to bring a can-do attitude, serious development skills, and a big-picture view to the team, so that you round out the teams capabilities. You will need to have significant experience in developing production services for the cloud, first rate knowledge of Azures core capabilities, a good understanding of agile workflows and tools, and experience with dev-ops in a production environment. We work with WebApi, Microsoft Orleans, Cosmos DB, and Kubernetes on the back-end – you will ideally know these technologies, or least be extremely comfortable with all the concepts around scalable distributed computing, data storage and deployment. You will also need to have experience with modern JavaScript-based web interfaces – ideally with a modern framework like React; and have some experience with Electron applications. We use Azure DevOps for CI/CD and you will ideally be very comfortable with that. We program in C#, Python, Rust, F#, and C++ so bonus-points if you can tick off more than a two of those (and add some more!) Our team is globally distributed between Copenhagen, Delft, and Sydney. 
You will need to be willing to travel periodically to Europe or the US to have regular face-time with your teammates. Qualifications 8 or more years of programming experience5 or more years of .NET programming experience3 or more years of Cloud Computing (preferably Azure)Undergraduate degree or equivalent in computer science, engineering or mathematicsPrior experience with quantum computing or quantum physics isnt required; this is a great opportunity for you to expand your knowledge and learn from some of the luminaries in these areas. Ability to meet Microsoft, customer and/or government security screening requirements are required for this role. These requirements include, but are not limited to the following specialized security screenings: Microsoft Cloud Background Check: This position will be required to pass the Microsoft Cloud Background Check upon hire/transfer and every two years thereafter. Microsoft is an equal opportunity employer. All qualified applicants will receive consideration for employment without regard to age, ancestry, color, family or medical care leave, gender identity or expression, genetic information, marital status, medical condition, national origin, physical or mental disability, political affiliation, protected veteran status, race, religion, sex (including pregnancy), sexual orientation, or any other characteristic protected by applicable laws, regulations and ordinances. We also consider qualified applicants regardless of criminal histories, consistent with legal requirements. If you need assistance and/or a reasonable accommodation due to a disability during the application or the recruiting process, please send a request via the Accommodation request form. Benefits/perks listed below may vary depending on the nature of your employment with Microsoft and the country where you work."
}
}
]
}

10
extensions.csproj Normal file
Просмотреть файл

@ -0,0 +1,10 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
<WarningsAsErrors></WarningsAsErrors>
<DefaultItemExcludes>**</DefaultItemExcludes>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Azure.WebJobs.Script.ExtensionsMetadataGenerator" Version="1.0.1" />
</ItemGroup>
</Project>

29
get_skill/__init__.py Normal file
Просмотреть файл

@ -0,0 +1,29 @@
import os
import logging
import json
import azure.functions as func
# Load the skills catalog once at module import (cold start) so every
# invocation of this function reuses the same in-memory dict.
# NOTE(review): the relative path assumes the function app's working
# directory is the project root — confirm against the host configuration.
with open('_data/skills.json') as skills_file:
    skills = json.load(skills_file)
def main(req: func.HttpRequest) -> func.HttpResponse:
    """Look up a single skill by the `skill_id` route parameter.

    Returns the skill's JSON record, 404 when the id is unknown,
    or 400 when no id was supplied.
    """
    skill_id = req.route_params.get('skill_id')
    logging.info(f'Fetching skill by id {skill_id}')
    # Guard clauses: handle the error paths first, success last.
    if not skill_id:
        return func.HttpResponse(
            "Please pass a skill_id on the query string or in the request body",
            status_code=400
        )
    if skill_id not in skills:
        return func.HttpResponse(
            f"Not Found: Skill with id {skill_id} does not exist",
            status_code=404
        )
    return func.HttpResponse(json.dumps(skills[skill_id]))

20
get_skill/function.json Normal file
Просмотреть файл

@ -0,0 +1,20 @@
{
"scriptFile": "__init__.py",
"bindings": [
{
"authLevel": "function",
"type": "httpTrigger",
"direction": "in",
"name": "req",
"methods": [
"get"
],
"route": "skills/{skill_id}"
},
{
"type": "http",
"direction": "out",
"name": "$return"
}
]
}

3
get_skill/sample.dat Normal file
Просмотреть файл

@ -0,0 +1,3 @@
{
"name": "Azure"
}

3
host.json Normal file
Просмотреть файл

@ -0,0 +1,3 @@
{
"version": "2.0"
}

2
requirements.txt Normal file
Просмотреть файл

@ -0,0 +1,2 @@
spacy
pydantic

0
services/__init__.py Normal file
Просмотреть файл

187
services/skills.py Normal file
Просмотреть файл

@ -0,0 +1,187 @@
from collections import defaultdict
import json
import os
from spacy.tokens import Span
from spacy.matcher import Matcher
from spacy.lang.en.stop_words import STOP_WORDS
# Register custom attributes on spacy Spans (accessed as span._.<name>).
# SkillMatcher fills these in for each matched skill entity; force=True
# makes re-import of this module safe if the extensions already exist.
Span.set_extension("skill_id", default=None, force=True)
Span.set_extension("skill_standardized_name", default=None, force=True)
Span.set_extension("skill_standardized_description", default=None, force=True)
Span.set_extension("skill_synonyms", default=None, force=True)
Span.set_extension("skill_related_skills", default=None, force=True)
Span.set_extension("skill_sources", default=None, force=True)
class SkillMatcher:
    """Spacy pipeline component for matching skills.

    Builds token patterns from a skills catalog (skill_id -> list of source
    dicts) and, when called on a Doc, adds non-overlapping SKILL entities
    annotated with the custom span extension attributes.
    """

    def __init__(self, nlp, name, skills, entity_label):
        """Compile the matcher from *skills* and register *entity_label*.

        :param nlp: the spacy pipeline this component is attached to
        :param name: pipe name used by nlp.has_pipe/add_pipe
        :param skills: mapping of skill_id -> list of source dicts
        :param entity_label: entity label string (e.g. "SKILL")
        """
        self.nlp = nlp
        self.name = name
        self.skills = skills
        # Intern the label so we can attach its hash id to created Spans.
        nlp.vocab.strings.add(entity_label)
        self.entity_label_id = nlp.vocab.strings[entity_label]
        patterns = self.build_patterns(skills)
        self.matcher = self.build_matcher(patterns)

    def __call__(self, doc):
        """Identify skill matches, add custom extension attributes to spans,
        and append the skill match spans to the doc.ents"""
        matches = list(self.matcher(doc))
        # Drop zero-length matches and de-duplicate.
        matches = set(
            [(m_id, start, end) for m_id, start, end in matches if start != end]
        )
        # Longest matches win; see get_sort_key for the tie-break order.
        matches = sorted(matches, key=self.get_sort_key, reverse=True)
        entities = list(doc.ents)
        new_entities = []
        seen_tokens = set()
        for match_id, start, end in matches:
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                span = Span(doc, start, end, label=self.entity_label_id)
                # The matcher key is the interned skill_id string.
                skill_id = self.nlp.vocab.strings[match_id]
                sources = self.skills[skill_id]
                # Pick a preferred source for display metadata, falling back
                # to the first source when none of the named ones are present.
                main_source = sources[0]
                for source in sources:
                    if source["sourceName"] == "Github Topics":
                        main_source = source
                        break
                    elif source["sourceName"] == "Microsoft Academic Topics":
                        main_source = source
                        break
                    elif source["sourceName"] == "Stackshare Skills":
                        main_source = source
                        break
                # NOTE(review): this loop re-assigns the same main_source
                # values once per source that has a displayName — only the
                # existence check matters; the attrs end up set iff at least
                # one source carries a displayName. It also assumes
                # main_source has displayName/shortDescription keys — confirm
                # against the skills.json schema.
                for source in sources:
                    if "displayName" in source:
                        span._.skill_id = skill_id
                        span._.skill_standardized_name = main_source["displayName"]
                        span._.skill_standardized_description = main_source[
                            "shortDescription"
                        ]
                        span._.skill_sources = [
                            {"name": s["sourceName"], "url": s["url"]} for s in sources
                        ]
                new_entities.append(span)
                # Evict any pre-existing entity that overlaps the new span.
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))
        doc.ents = entities + new_entities
        return doc

    def get_sort_key(self, m):
        """Used to disambiguate overlapping entities"""
        # Sorted with reverse=True: longest spans first; among equal
        # lengths, the later start position comes first.
        return (m[2] - m[1], m[1])

    def skill_pattern(self, skill, split_token=None):
        """Create a single skill pattern.

        Splits *skill* on whitespace (or on *split_token* when given) and
        emits one token-attribute dict per piece for the spacy Matcher.
        """
        pattern = []
        if split_token:
            split = skill.split(split_token)
        else:
            split = skill.split()
        for b in split:
            # NOTE(review): `b.upper() == skill` is only true for a
            # single-token, all-uppercase name (e.g. an acronym), which then
            # matches case-sensitively via ORTH. Possibly `b.upper() == b`
            # was intended so multi-token acronyms also match exactly —
            # confirm before changing.
            if b.upper() == skill:
                pattern.append({"ORTH": b})
            else:
                pattern.append({"LOWER": b.lower()})
        return pattern

    def build_patterns(self, skills):
        """Build up lists of spacy token patterns for matcher"""
        patterns = defaultdict(list)
        # Names containing these characters also get a pattern split on them
        # (e.g. "node.js" -> ["node", "js"]).
        split_tokens = [".", "/"]
        # Hand-curated synonyms for a few high-traffic skills.
        special_case_synonyms = {
            "algorithm": ["algorithms"],
            "artificial-intelligence": ["ai", "AI"],
            "machine-learning": ["ml", "ML"],
            "natural-language-processing": ["nlp", "NLP"],
        }
        for skill_id, sources in skills.items():
            skill_names = set()
            if skill_id in special_case_synonyms:
                for syn in special_case_synonyms[skill_id]:
                    skill_names.add(syn)
            for source in sources:
                if "displayName" in source:
                    skill_names.add(source["displayName"])
            for name in skill_names:
                # Preserve case for all-uppercase names (acronyms);
                # normalize everything else to lowercase.
                if name.upper() == name:
                    skill_name = name
                else:
                    skill_name = name.lower().strip()
                # Skip names that collide with English stop words ("the",
                # "it", ...) to avoid matching everywhere.
                if skill_name not in STOP_WORDS:
                    pattern = self.skill_pattern(skill_name)
                    if pattern:
                        patterns[skill_id].append(pattern)
                    for t in split_tokens:
                        if t in skill_name:
                            patterns[skill_id].append(
                                (self.skill_pattern(skill_name, t))
                            )
        return patterns

    def build_matcher(self, patterns):
        """Build rule-based token matcher for skills"""
        matcher = Matcher(self.nlp.vocab)
        # NOTE(review): the loop variable shadows the `patterns` argument.
        # It works because the .items() iterator is created before the
        # rebinding, but renaming the inner variable would be safer.
        for skill_id, patterns in patterns.items():
            if patterns:
                matcher.add(skill_id, None, *patterns)
        return matcher
class SkillsExtractor:
    """Extracts skills from text"""

    def __init__(self, nlp):
        """Load the skills catalog and register the SKILL matcher pipe."""
        self.nlp = nlp
        with open("_data/skills.json") as skills_file:
            self.skills = json.load(skills_file)
        matcher_component = SkillMatcher(
            self.nlp, "skill_matcher", self.skills, "SKILL"
        )
        # Only register the component once per pipeline.
        if not self.nlp.has_pipe(matcher_component.name):
            self.nlp.add_pipe(matcher_component)

    def extract_skills(self, text):
        """Extract skills from text using the
        Spacy Matcher API for custom Skills Patterns"""
        doc = self.nlp(text)
        results = defaultdict(lambda: defaultdict(list))
        for ent in doc.ents:
            if ent.label_ != "SKILL":
                continue
            entry = results[ent._.skill_id]
            entry["matches"].append(
                {
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "label": ent.label_,
                    "text": ent.text,
                }
            )
            # Record the skill metadata once, from the first match seen.
            if "name" not in entry:
                entry["name"] = ent._.skill_standardized_name
            if "description" not in entry:
                entry["description"] = ent._.skill_standardized_description
            if "sources" not in entry:
                entry["sources"] = ent._.skill_sources
        return results