This commit is contained in:
Kabir Khan 2019-04-05 10:30:59 -07:00
Коммит 3cae5bb603
26 изменённых файлов: 571 добавлений и 0 удалений

1
.funcignore Normal file
Просмотреть файл

@ -0,0 +1 @@
.env

44
.gitignore поставляемый Normal file
Просмотреть файл

@ -0,0 +1,44 @@
bin
obj
csx
.vs
edge
Publish
*.user
*.suo
*.cscfg
*.Cache
project.lock.json
/packages
/TestResults
/tools/NuGet.exe
/App_Data
/secrets
/data
.secrets
appsettings.json
local.settings.json
node_modules
dist
# Local python packages
.python_packages/
# Python Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
functionapp_skills.zip

6
.vscode/extensions.json поставляемый Normal file
Просмотреть файл

@ -0,0 +1,6 @@
{
"recommendations": [
"ms-azuretools.vscode-azurefunctions",
"ms-python.python"
]
}

12
.vscode/launch.json поставляемый Normal file
Просмотреть файл

@ -0,0 +1,12 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Attach to Python Functions",
"type": "python",
"request": "attach",
"port": 9091,
"preLaunchTask": "func: host start"
}
]
}

13
.vscode/settings.json поставляемый Normal file
Просмотреть файл

@ -0,0 +1,13 @@
{
"azureFunctions.projectRuntime": "~2",
"azureFunctions.projectLanguage": "Python",
"azureFunctions.deploySubpath": "functionapp_skills.zip",
"azureFunctions.preDeployTask": "func: pack --build-native-deps",
"files.exclude": {
"obj": true,
"bin": true
},
"azureFunctions.pythonVenv": ".env",
"debug.internalConsoleOptions": "neverOpen",
"python.pythonPath": ".env\\Scripts\\python.exe"
}

32
.vscode/tasks.json поставляемый Normal file
Просмотреть файл

@ -0,0 +1,32 @@
{
"version": "2.0.0",
"tasks": [
{
"type": "func",
"command": "host start",
"problemMatcher": "$func-watch",
"isBackground": true,
"dependsOn": "func: extensions install"
},
{
"type": "func",
"command": "extensions install",
"dependsOn": "pipInstall",
"problemMatcher": []
},
{
"label": "pipInstall",
"type": "shell",
"osx": {
"command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
},
"windows": {
"command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt"
},
"linux": {
"command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
},
"problemMatcher": []
}
]
}

6
Dockerfile Normal file
Просмотреть файл

@ -0,0 +1,6 @@
FROM mcr.microsoft.com/azure-functions/python:2.0
COPY . /home/site/wwwroot
RUN cd /home/site/wwwroot && \
pip install -r requirements.txt

5
README.md Normal file
Просмотреть файл

@ -0,0 +1,5 @@
# Azure Functions Skills Extractor
## Follow instructions here
https://docs.microsoft.com/en-us/azure/azure-functions/functions-create-function-linux-custom-image#create-a-resource-group

0
_data/.gitkeep Normal file
Просмотреть файл

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,14 @@
{
"documents":[
{
"id": "a1",
"text": "Machine learning (ML) is a field of computer science that uses statistical techniques to give computer systems the ability to \"learn\" Evolved from the study of pattern recognition and computational learning theory in artificial intelligence, machine learning explores the study and construction of algorithms that can learn from and make predictions on data",
"language": "en"
},
{
"id": "a2",
"text": "Some skills text like pattern recognition and algorithms and nlp",
"language": "en"
}
]
}

1
_data/skills.json Normal file

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

1
_data/sources.json Normal file
Просмотреть файл

@ -0,0 +1 @@
["Stackshare Skill", "Github Topics", "Stackshare", "Microsoft Academic Graph", "Stackshare Skills"]

24
azure-pipelines.yml Normal file
Просмотреть файл

@ -0,0 +1,24 @@
pool:
vmImage: 'Ubuntu 16.04'
steps:
- task: NodeTool@0
inputs:
versionSpec: '8.x'
- script: |
set -e
echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ wheezy main" | sudo tee /etc/apt/sources.list.d/azure-cli.list
curl -L https://packages.microsoft.com/keys/microsoft.asc | sudo apt-key add -
sudo apt-get install -y apt-transport-https
echo "install Azure CLI..."
sudo apt-get update && sudo apt-get install -y azure-cli
npm i -g azure-functions-core-tools --unsafe-perm true
echo "installing dotnet core"
curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin --channel 2.0
- script: |
set -e
az login --service-principal --username "$(APP_ID)" --password "$(PASSWORD)" --tenant "$(TENANT_ID)"
func settings add FUNCTIONS_WORKER_RUNTIME python
func extensions install
func azure functionapp publish $(APP_NAME) --build-native-deps

Просмотреть файл

@ -0,0 +1,73 @@
import logging
import json
import asyncio
import azure.functions as func
import spacy
from spacy.lang.en import English
from .models import *
from ..services.skills import SkillsExtractor
nlp = English()
skills_extractor = SkillsExtractor(nlp)
async def extract_from_text(text: str):
    """Run the shared SkillsExtractor over *text*.

    Returns a list of ``{"id": ..., "standardizedName": ...}`` dicts,
    one per distinct skill found in the text.
    """
    found = skills_extractor.extract_skills(text)
    return [
        {"id": sid, "standardizedName": info["name"]}
        for sid, info in found.items()
    ]
async def extract_from_doc(doc, skill_property='id'):
    """Extract Skills from a single Document.

    *doc* is one Azure Search record; *skill_property* selects which field
    of each extracted skill is returned ("id" or "standardizedName").
    """
    extracted = await extract_from_text(doc.data.text)
    skill_values = [skill[skill_property] for skill in extracted]
    return {
        "recordId": doc.recordId,
        "data": {
            "skills": skill_values
        },
        "warnings": None,
        "errors": None
    }
async def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP entry point for the Azure Cognitive Search custom skill.

    Parses the request body into an AzureSearchDocumentsRequest, extracts
    skills from every document concurrently, and returns one result record
    per input document.

    Query params:
        skill_property: which field of each skill to return per document
            ("id" by default, or "standardizedName").

    Returns:
        200 with a JSON ``{"values": [...]}`` body on success,
        400 when the body is missing or fails validation.
    """
    logging.info('Python HTTP trigger function processed a request.')
    skill_property = req.params.get('skill_property', 'id')
    try:
        body = AzureSearchDocumentsRequest(**req.get_json())
        logging.info(body)
    except ValueError:
        # Covers both malformed JSON and model validation failures
        # (pydantic's ValidationError subclasses ValueError).
        return func.HttpResponse(
            "Please pass a valid request body",
            status_code=400
        )
    # BUG FIX: the original only returned inside `if body:` and fell off the
    # end (implicit None) otherwise, which the Functions host reports as an
    # error. Guard explicitly so every path returns an HttpResponse.
    if not body:
        return func.HttpResponse(
            "Please pass a valid request body",
            status_code=400
        )
    # Create one coroutine per document, then await them all concurrently.
    tasks = [
        extract_from_doc(doc, skill_property=skill_property)
        for doc in body.values
    ]
    results = await asyncio.gather(*tasks)
    return func.HttpResponse(
        json.dumps({'values': results}),
        headers={'Content-Type': 'application/json'}
    )

Просмотреть файл

@ -0,0 +1,20 @@
{
"scriptFile": "__init__.py",
"bindings": [
{
"authLevel": "function",
"type": "httpTrigger",
"direction": "in",
"name": "req",
"route": "azure_cognitive_search",
"methods": [
"post"
]
},
{
"type": "http",
"direction": "out",
"name": "$return"
}
]
}

Просмотреть файл

@ -0,0 +1,37 @@
from enum import Enum
from typing import List, Optional
from pydantic import BaseModel, Schema
class SkillProperty(str, Enum):
    """Which field of an extracted skill a caller may request."""
    id = "id"
    name = "standardizedName"


# Azure Search Cognitive Skills Models
class AzureSearchDocumentDataRequest(BaseModel):
    """Payload of one incoming document: raw text plus its language code."""
    text: str
    language: str = "en"


class AzureSearchDocumentRequest(BaseModel):
    """One record in an Azure Cognitive Search custom-skill request."""
    recordId: str
    data: AzureSearchDocumentDataRequest


class AzureSearchDocumentsRequest(BaseModel):
    """Top-level request body: the batch of records under "values"."""
    values: List[AzureSearchDocumentRequest]


class AzureSearchDocumentDataResponse(BaseModel):
    """Per-record response payload: the extracted skill strings."""
    skills: List[str]


class AzureSearchDocumentResponse(BaseModel):
    """One record in the custom-skill response; errors/warnings may be None."""
    recordId: str
    data: AzureSearchDocumentDataResponse
    errors: Optional[List[str]]
    warnings: Optional[List[str]]


class AzureSearchDocumentsResponse(BaseModel):
    """Top-level response body mirroring the request's "values" batch."""
    values: List[AzureSearchDocumentResponse]

Просмотреть файл

@ -0,0 +1,10 @@
{
"values": [
{
"recordId": "a1",
"data": {
"text": "Be part of the next revolution in computing! Join Microsofts Quantum Architecture and Computation team and work on the software that will be used to control quantum computers. You will join the team that is developing the software to acquire, store, secure, share, process and visualize data across the quantum program. You will work with physicists, hardware designers, software developers, and researchers to help make quantum computing a reality. Responsibilities This role is responsible for developing, deploying, maintaining and supporting several Azure services to serve the data needs of various facets of the quantum program. You will be part of a cross-functional distributed team with a broad range of skills. We work in a collaborative, agile, somewhat skunkworks fashion, and you will be expected to bring a can-do attitude, serious development skills, and a big-picture view to the team, so that you round out the teams capabilities. You will need to have significant experience in developing production services for the cloud, first rate knowledge of Azures core capabilities, a good understanding of agile workflows and tools, and experience with dev-ops in a production environment. We work with WebApi, Microsoft Orleans, Cosmos DB, and Kubernetes on the back-end – you will ideally know these technologies, or least be extremely comfortable with all the concepts around scalable distributed computing, data storage and deployment. You will also need to have experience with modern JavaScript-based web interfaces – ideally with a modern framework like React; and have some experience with Electron applications. We use Azure DevOps for CI/CD and you will ideally be very comfortable with that. We program in C#, Python, Rust, F#, and C++ so bonus-points if you can tick off more than a two of those (and add some more!) Our team is globally distributed between Copenhagen, Delft, and Sydney. 
You will need to be willing to travel periodically to Europe or the US to have regular face-time with your teammates. Qualifications 8 or more years of programming experience5 or more years of .NET programming experience3 or more years of Cloud Computing (preferably Azure)Undergraduate degree or equivalent in computer science, engineering or mathematicsPrior experience with quantum computing or quantum physics isnt required; this is a great opportunity for you to expand your knowledge and learn from some of the luminaries in these areas. Ability to meet Microsoft, customer and/or government security screening requirements are required for this role. These requirements include, but are not limited to the following specialized security screenings: Microsoft Cloud Background Check: This position will be required to pass the Microsoft Cloud Background Check upon hire/transfer and every two years thereafter. Microsoft is an equal opportunity employer. All qualified applicants will receive consideration for employment without regard to age, ancestry, color, family or medical care leave, gender identity or expression, genetic information, marital status, medical condition, national origin, physical or mental disability, political affiliation, protected veteran status, race, religion, sex (including pregnancy), sexual orientation, or any other characteristic protected by applicable laws, regulations and ordinances. We also consider qualified applicants regardless of criminal histories, consistent with legal requirements. If you need assistance and/or a reasonable accommodation due to a disability during the application or the recruiting process, please send a request via the Accommodation request form. Benefits/perks listed below may vary depending on the nature of your employment with Microsoft and the country where you work."
}
}
]
}

10
extensions.csproj Normal file
Просмотреть файл

@ -0,0 +1,10 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
<WarningsAsErrors></WarningsAsErrors>
<DefaultItemExcludes>**</DefaultItemExcludes>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Azure.WebJobs.Script.ExtensionsMetadataGenerator" Version="1.0.1" />
</ItemGroup>
</Project>

29
get_skill/__init__.py Normal file
Просмотреть файл

@ -0,0 +1,29 @@
import os
import logging
import json
import azure.functions as func
# Load the skills catalog once at module import (cold start) so every
# invocation of this function reuses the same in-memory dict.
# NOTE(review): the relative path assumes the function app's working
# directory is the project root — confirm against the host configuration.
with open('_data/skills.json') as skills_file:
    skills = json.load(skills_file)
def main(req: func.HttpRequest) -> func.HttpResponse:
    """Look up a single skill by the `skill_id` route parameter.

    Returns the skill's JSON record, 404 when the id is unknown,
    or 400 when no id was supplied.
    """
    skill_id = req.route_params.get('skill_id')
    logging.info(f'Fetching skill by id {skill_id}')
    # Guard clauses: handle the error paths first, success last.
    if not skill_id:
        return func.HttpResponse(
            "Please pass a skill_id on the query string or in the request body",
            status_code=400
        )
    if skill_id not in skills:
        return func.HttpResponse(
            f"Not Found: Skill with id {skill_id} does not exist",
            status_code=404
        )
    return func.HttpResponse(json.dumps(skills[skill_id]))

20
get_skill/function.json Normal file
Просмотреть файл

@ -0,0 +1,20 @@
{
"scriptFile": "__init__.py",
"bindings": [
{
"authLevel": "function",
"type": "httpTrigger",
"direction": "in",
"name": "req",
"methods": [
"get"
],
"route": "skills/{skill_id}"
},
{
"type": "http",
"direction": "out",
"name": "$return"
}
]
}

3
get_skill/sample.dat Normal file
Просмотреть файл

@ -0,0 +1,3 @@
{
"name": "Azure"
}

3
host.json Normal file
Просмотреть файл

@ -0,0 +1,3 @@
{
"version": "2.0"
}

2
requirements.txt Normal file
Просмотреть файл

@ -0,0 +1,2 @@
spacy
pydantic

0
services/__init__.py Normal file
Просмотреть файл

187
services/skills.py Normal file
Просмотреть файл

@ -0,0 +1,187 @@
from collections import defaultdict
import json
import os
from spacy.tokens import Span
from spacy.matcher import Matcher
from spacy.lang.en.stop_words import STOP_WORDS
# Register custom attributes on spacy Spans (accessed as span._.<name>).
# SkillMatcher fills these in for each matched skill entity; force=True
# makes re-import of this module safe if the extensions already exist.
Span.set_extension("skill_id", default=None, force=True)
Span.set_extension("skill_standardized_name", default=None, force=True)
Span.set_extension("skill_standardized_description", default=None, force=True)
Span.set_extension("skill_synonyms", default=None, force=True)
Span.set_extension("skill_related_skills", default=None, force=True)
Span.set_extension("skill_sources", default=None, force=True)
class SkillMatcher:
    """Spacy pipeline component for matching skills.

    Builds token patterns from a skills catalog (skill_id -> list of source
    dicts) and, when called on a Doc, adds non-overlapping SKILL entities
    annotated with the custom span extension attributes.
    """

    def __init__(self, nlp, name, skills, entity_label):
        """Compile the matcher from *skills* and register *entity_label*.

        :param nlp: the spacy pipeline this component is attached to
        :param name: pipe name used by nlp.has_pipe/add_pipe
        :param skills: mapping of skill_id -> list of source dicts
        :param entity_label: entity label string (e.g. "SKILL")
        """
        self.nlp = nlp
        self.name = name
        self.skills = skills
        # Intern the label so we can attach its hash id to created Spans.
        nlp.vocab.strings.add(entity_label)
        self.entity_label_id = nlp.vocab.strings[entity_label]
        patterns = self.build_patterns(skills)
        self.matcher = self.build_matcher(patterns)

    def __call__(self, doc):
        """Identify skill matches, add custom extension attributes to spans,
        and append the skill match spans to the doc.ents"""
        matches = list(self.matcher(doc))
        # Drop zero-length matches and de-duplicate.
        matches = set(
            [(m_id, start, end) for m_id, start, end in matches if start != end]
        )
        # Longest matches win; see get_sort_key for the tie-break order.
        matches = sorted(matches, key=self.get_sort_key, reverse=True)
        entities = list(doc.ents)
        new_entities = []
        seen_tokens = set()
        for match_id, start, end in matches:
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                span = Span(doc, start, end, label=self.entity_label_id)
                # The matcher key is the interned skill_id string.
                skill_id = self.nlp.vocab.strings[match_id]
                sources = self.skills[skill_id]
                # Pick a preferred source for display metadata, falling back
                # to the first source when none of the named ones are present.
                main_source = sources[0]
                for source in sources:
                    if source["sourceName"] == "Github Topics":
                        main_source = source
                        break
                    elif source["sourceName"] == "Microsoft Academic Topics":
                        main_source = source
                        break
                    elif source["sourceName"] == "Stackshare Skills":
                        main_source = source
                        break
                # NOTE(review): this loop re-assigns the same main_source
                # values once per source that has a displayName — only the
                # existence check matters; the attrs end up set iff at least
                # one source carries a displayName. It also assumes
                # main_source has displayName/shortDescription keys — confirm
                # against the skills.json schema.
                for source in sources:
                    if "displayName" in source:
                        span._.skill_id = skill_id
                        span._.skill_standardized_name = main_source["displayName"]
                        span._.skill_standardized_description = main_source[
                            "shortDescription"
                        ]
                        span._.skill_sources = [
                            {"name": s["sourceName"], "url": s["url"]} for s in sources
                        ]
                new_entities.append(span)
                # Evict any pre-existing entity that overlaps the new span.
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))
        doc.ents = entities + new_entities
        return doc

    def get_sort_key(self, m):
        """Used to disambiguate overlapping entities"""
        # Sorted with reverse=True: longest spans first; among equal
        # lengths, the later start position comes first.
        return (m[2] - m[1], m[1])

    def skill_pattern(self, skill, split_token=None):
        """Create a single skill pattern.

        Splits *skill* on whitespace (or on *split_token* when given) and
        emits one token-attribute dict per piece for the spacy Matcher.
        """
        pattern = []
        if split_token:
            split = skill.split(split_token)
        else:
            split = skill.split()
        for b in split:
            # NOTE(review): `b.upper() == skill` is only true for a
            # single-token, all-uppercase name (e.g. an acronym), which then
            # matches case-sensitively via ORTH. Possibly `b.upper() == b`
            # was intended so multi-token acronyms also match exactly —
            # confirm before changing.
            if b.upper() == skill:
                pattern.append({"ORTH": b})
            else:
                pattern.append({"LOWER": b.lower()})
        return pattern

    def build_patterns(self, skills):
        """Build up lists of spacy token patterns for matcher"""
        patterns = defaultdict(list)
        # Names containing these characters also get a pattern split on them
        # (e.g. "node.js" -> ["node", "js"]).
        split_tokens = [".", "/"]
        # Hand-curated synonyms for a few high-traffic skills.
        special_case_synonyms = {
            "algorithm": ["algorithms"],
            "artificial-intelligence": ["ai", "AI"],
            "machine-learning": ["ml", "ML"],
            "natural-language-processing": ["nlp", "NLP"],
        }
        for skill_id, sources in skills.items():
            skill_names = set()
            if skill_id in special_case_synonyms:
                for syn in special_case_synonyms[skill_id]:
                    skill_names.add(syn)
            for source in sources:
                if "displayName" in source:
                    skill_names.add(source["displayName"])
            for name in skill_names:
                # Preserve case for all-uppercase names (acronyms);
                # normalize everything else to lowercase.
                if name.upper() == name:
                    skill_name = name
                else:
                    skill_name = name.lower().strip()
                # Skip names that collide with English stop words ("the",
                # "it", ...) to avoid matching everywhere.
                if skill_name not in STOP_WORDS:
                    pattern = self.skill_pattern(skill_name)
                    if pattern:
                        patterns[skill_id].append(pattern)
                    for t in split_tokens:
                        if t in skill_name:
                            patterns[skill_id].append(
                                (self.skill_pattern(skill_name, t))
                            )
        return patterns

    def build_matcher(self, patterns):
        """Build rule-based token matcher for skills"""
        matcher = Matcher(self.nlp.vocab)
        # NOTE(review): the loop variable shadows the `patterns` argument.
        # It works because the .items() iterator is created before the
        # rebinding, but renaming the inner variable would be safer.
        for skill_id, patterns in patterns.items():
            if patterns:
                matcher.add(skill_id, None, *patterns)
        return matcher
class SkillsExtractor:
    """Extracts skills from text"""

    def __init__(self, nlp):
        """Load the skills catalog and register the SKILL matcher pipe."""
        self.nlp = nlp
        with open("_data/skills.json") as skills_file:
            self.skills = json.load(skills_file)
        matcher_component = SkillMatcher(
            self.nlp, "skill_matcher", self.skills, "SKILL"
        )
        # Only register the component once per pipeline.
        if not self.nlp.has_pipe(matcher_component.name):
            self.nlp.add_pipe(matcher_component)

    def extract_skills(self, text):
        """Extract skills from text using the
        Spacy Matcher API for custom Skills Patterns"""
        doc = self.nlp(text)
        results = defaultdict(lambda: defaultdict(list))
        for ent in doc.ents:
            if ent.label_ != "SKILL":
                continue
            entry = results[ent._.skill_id]
            entry["matches"].append(
                {
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "label": ent.label_,
                    "text": ent.text,
                }
            )
            # Record the skill metadata once, from the first match seen.
            if "name" not in entry:
                entry["name"] = ent._.skill_standardized_name
            if "description" not in entry:
                entry["description"] = ent._.skill_standardized_description
            if "sources" not in entry:
                entry["sources"] = ent._.skill_sources
        return results