# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from collections import defaultdict
import itertools
from pathlib import Path

import srsly
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.language import Language
from spacy.pipeline import EntityRuler


class SkillsExtractor:
    """Extracts skills from text using SpaCy's EntityRuler Component"""

    def __init__(self, nlp: Language, data_path: Path = Path("data")):
        self.nlp = nlp
        self.data_path = data_path
        self.skills = self._get_skills()

        # Build token patterns from the skills collection plus any
        # user-defined extras, then register them on a single EntityRuler
        patterns = self._build_patterns(self.skills)
        extra_patterns = self._get_extra_skill_patterns()
        ruler = EntityRuler(nlp, overwrite_ents=True)
        ruler.add_patterns(itertools.chain(patterns, extra_patterns))
        if not self.nlp.has_pipe("skills_ruler"):
            self.nlp.add_pipe(ruler, name="skills_ruler")
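
    # After construction the ruler sits at the end of the pipeline. For
    # example, assuming a stock English model was passed in:
    #
    #   nlp.pipe_names  ->  ["tagger", "parser", "ner", "skills_ruler"]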

    def _get_skills(self):
        """Query skills from skills collection"""
        skills_path = self.data_path / "skills.json"
        skills = srsly.read_json(skills_path)
        return skills
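
    # Sketch of the shape "skills.json" is assumed to have, inferred from how
    # this class reads it (the real file may carry additional fields). The
    # skill id and URL below are purely illustrative:
    #
    # {
    #     "machine-learning": {
    #         "aliases": ["machine learning", "ml"],
    #         "sources": [
    #             {
    #                 "sourceName": "Github Topics",
    #                 "displayName": "Machine Learning",
    #                 "shortDescription": "...",
    #                 "longDescription": "...",
    #                 "url": "https://github.com/topics/machine-learning"
    #             }
    #         ]
    #     }
    # }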

    def _get_extra_skill_patterns(self):
        """Load extra user-added skill patterns"""
        extra_patterns_path = self.data_path / "extra_skill_patterns.jsonl"
        extra_skill_patterns = srsly.read_jsonl(extra_patterns_path)
        return extra_skill_patterns
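
    # Each line of "extra_skill_patterns.jsonl" is expected to be a standard
    # spaCy EntityRuler entry whose label follows the same "SKILL|<skill_id>"
    # convention used by _build_patterns below; an illustrative line:
    #
    # {"label": "SKILL|my-custom-skill", "pattern": [{"LOWER": "my"}, {"LOWER": "custom"}, {"LOWER": "skill"}]}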

    def _skill_pattern(self, skill: str, split_token: str = None):
        """Create a single skill pattern"""
        pattern = []
        if split_token:
            split = skill.split(split_token)
        else:
            split = skill.split()
        for b in split:
            if b:
                if b.upper() == skill:
                    # Single-token, all-caps skills (e.g. acronyms) keep
                    # their exact text
                    pattern.append({"TEXT": b})
                else:
                    # Everything else matches case-insensitively
                    pattern.append({"LOWER": b.lower()})
        return pattern
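
    # For illustration (not exercised anywhere in this module), the builder
    # above produces token patterns along these lines:
    #
    #   _skill_pattern("machine learning")          -> [{"LOWER": "machine"}, {"LOWER": "learning"}]
    #   _skill_pattern("node.js", split_token=".")  -> [{"LOWER": "node"}, {"LOWER": "js"}]
    #   _skill_pattern("AWS")                       -> [{"TEXT": "AWS"}]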

    def _build_patterns(self, skills: dict, create: bool = False):
        """Build all matcher patterns"""
        patterns_path = self.data_path / "skill_patterns.jsonl"
        if not patterns_path.exists() or create:
            # Build up lists of spaCy token patterns for the matcher
            patterns = []
            split_tokens = [".", "/", "-"]

            for skill_id, skill_info in skills.items():
                aliases = skill_info["aliases"]
                sources = skill_info["sources"]
                skill_names = set()
                for al in aliases:
                    skill_names.add(al)
                for source in sources:
                    if "displayName" in source:
                        skill_names.add(source["displayName"])

                for name in skill_names:
                    if name.upper() == name:
                        skill_name = name
                    else:
                        skill_name = name.lower().strip()

                    if skill_name not in STOP_WORDS:
                        pattern = self._skill_pattern(skill_name)

                        if pattern:
                            label = f"SKILL|{skill_id}"
                            patterns.append({"label": label, "pattern": pattern})

                            # Also add a variant split on ".", "/" or "-" so
                            # names like "node.js" match when tokenized apart
                            for t in split_tokens:
                                if t in skill_name:
                                    patterns.append(
                                        {
                                            "label": label,
                                            "pattern": self._skill_pattern(
                                                skill_name, t
                                            ),
                                        }
                                    )

            srsly.write_jsonl(patterns_path, patterns)
            return patterns
        else:
            patterns = srsly.read_jsonl(patterns_path)
            return patterns
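
    # A skill name containing one of the split tokens therefore ends up with
    # more than one entry in "skill_patterns.jsonl", e.g. for an illustrative
    # skill id "node-js":
    #
    #   {"label": "SKILL|node-js", "pattern": [{"LOWER": "node.js"}]}
    #   {"label": "SKILL|node-js", "pattern": [{"LOWER": "node"}, {"LOWER": "js"}]}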

    def extract_skills(self, text: str):
        """Extract skills from unstructured text"""
        doc = self.nlp(text)
        found_skills = defaultdict(lambda: defaultdict(list))

        for ent in doc.ents:
            if "|" in ent.label_:
                ent_label, skill_id = ent.label_.split("|")
                if ent_label == "SKILL" and skill_id:
                    found_skills[skill_id]["matches"].append(
                        {
                            "start": ent.start_char,
                            "end": ent.end_char,
                            "label": ent_label,
                            "text": ent.text,
                        }
                    )
                    try:
                        skill_info = self.skills[skill_id]
                        sources = skill_info["sources"]

                        # Some sources have better skill descriptions than others.
                        # This is a simple heuristic for cascading through the
                        # sources to pick the best description available per skill
                        main_source = sources[0]
                        for source in sources:
                            if source["sourceName"] == "Github Topics":
                                main_source = source
                                break
                            elif source["sourceName"] == "Microsoft Academic Topics":
                                main_source = source
                                break
                            elif source["sourceName"] == "Stackshare Skills":
                                main_source = source
                                break
                    except KeyError:
                        # This happens when a pattern defined in data/extra_skill_patterns.jsonl
                        # is matched. The skill is not in data/skills.json, so there's no
                        # extra metadata about the skill from an established source.
                        sources = []
                        main_source = {
                            "displayName": ent.text,
                            "shortDescription": "",
                            "longDescription": "",
                        }

                    keys = ["displayName", "shortDescription", "longDescription"]
                    for k in keys:
                        found_skills[skill_id][k] = main_source[k]
                    found_skills[skill_id]["sources"] = [
                        {"name": s["sourceName"], "url": s["url"]} for s in sources
                    ]

        return found_skills
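

# Minimal usage sketch (an assumption, not part of the original module): it
# presumes an English spaCy model such as "en_core_web_sm" is installed and
# that a "data" directory with skills.json, skill_patterns.jsonl and
# extra_skill_patterns.jsonl exists next to the script.
if __name__ == "__main__":
    import spacy

    nlp = spacy.load("en_core_web_sm")
    extractor = SkillsExtractor(nlp)
    found = extractor.extract_skills(
        "We need someone with experience in Python and machine learning."
    )
    for skill_id, info in found.items():
        print(skill_id, info["displayName"], info["matches"])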