type corrections in recon.v1.fix_annotations

This commit is contained in:
Kabir Khan 2020-06-25 13:25:41 -07:00
Родитель 655a8796d9
Коммит 2b783308c4
7 изменённых файлов: 210 добавлений и 86 удалений

Просмотреть файл

@ -4,15 +4,7 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed',)).History will not be written to the database.\n"
]
}
],
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2"
@ -20,20 +12,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"text\": \"Have you used the new version of my model?\", \"spans\": [{\"start\": 36, \"end\": 41, \"token_start\": 8, \"token_end\": 8, \"label\": \"SKILL\"}], \"tokens\": [{\"text\": \"Have\", \"start\": 0, \"end\": 4, \"id\": 0}, {\"text\": \"you\", \"start\": 5, \"end\": 8, \"id\": 1}, {\"text\": \"used\", \"start\": 9, \"end\": 13, \"id\": 2}, {\"text\": \"the\", \"start\": 14, \"end\": 17, \"id\": 3}, {\"text\": \"new\", \"start\": 18, \"end\": 21, \"id\": 4}, {\"text\": \"version\", \"start\": 22, \"end\": 29, \"id\": 5}, {\"text\": \"of\", \"start\": 30, \"end\": 32, \"id\": 6}, {\"text\": \"my\", \"start\": 33, \"end\": 35, \"id\": 7}, {\"text\": \"model\", \"start\": 36, \"end\": 41, \"id\": 8}, {\"text\": \"?\", \"start\": 41, \"end\": 42, \"id\": 9}]},\n",
"{\"text\": \"I'd like to work as an actor or model if possible.\", \"spans\": [{\"text\": \"actor\", \"start\": 23, \"end\": 28, \"token_start\": 7, \"token_end\": 7, \"label\": \"JOB_ROLE\"}, {\"text\": \"model\", \"start\": 32, \"end\": 37, \"token_start\": 9, \"token_end\": 9, \"label\": \"JOB_ROLE\"}], \"tokens\": [{\"text\": \"I\", \"start\": 0, \"end\": 1, \"id\": 0}, {\"text\": \"'d\", \"start\": 1, \"end\": 3, \"id\": 1}, {\"text\": \"like\", \"start\": 4, \"end\": 8, \"id\": 2}, {\"text\": \"to\", \"start\": 9, \"end\": 11, \"id\": 3}, {\"text\": \"work\", \"start\": 12, \"end\": 16, \"id\": 4}, {\"text\": \"as\", \"start\": 17, \"end\": 19, \"id\": 5}, {\"text\": \"an\", \"start\": 20, \"end\": 22, \"id\": 6}, {\"text\": \"actor\", \"start\": 23, \"end\": 28, \"id\": 7}, {\"text\": \"or\", \"start\": 29, \"end\": 31, \"id\": 8}, {\"text\": \"model\", \"start\": 32, \"end\": 37, \"id\": 9}, {\"text\": \"if\", \"start\": 38, \"end\": 40, \"id\": 10}, {\"text\": \"possible\", \"start\": 41, \"end\": 49, \"id\": 11}, {\"text\": \".\", \"start\": 49, \"end\": 50, \"id\": 12}]},\n",
"{\"text\": \"We are looking for a Software Development Engineer who has solid coding skills, a strong machine learning background, and is passionate about developing new AI products.\", \"tokens\": [{\"text\": \"We\", \"start\": 0, \"end\": 2, \"id\": 0}, {\"text\": \"are\", \"start\": 3, \"end\": 6, \"id\": 1}, {\"text\": \"looking\", \"start\": 7, \"end\": 14, \"id\": 2}, {\"text\": \"for\", \"start\": 15, \"end\": 18, \"id\": 3}, {\"text\": \"a\", \"start\": 19, \"end\": 20, \"id\": 4}, {\"text\": \"Software\", \"start\": 21, \"end\": 29, \"id\": 5}, {\"text\": \"Development\", \"start\": 30, \"end\": 41, \"id\": 6}, {\"text\": \"Engineer\", \"start\": 42, \"end\": 50, \"id\": 7}, {\"text\": \"who\", \"start\": 51, \"end\": 54, \"id\": 8}, {\"text\": \"has\", \"start\": 55, \"end\": 58, \"id\": 9}, {\"text\": \"solid\", \"start\": 59, \"end\": 64, \"id\": 10}, {\"text\": \"coding\", \"start\": 65, \"end\": 71, \"id\": 11}, {\"text\": \"skills\", \"start\": 72, \"end\": 78, \"id\": 12}, {\"text\": \",\", \"start\": 78, \"end\": 79, \"id\": 13}, {\"text\": \"a\", \"start\": 80, \"end\": 81, \"id\": 14}, {\"text\": \"strong\", \"start\": 82, \"end\": 88, \"id\": 15}, {\"text\": \"machine\", \"start\": 89, \"end\": 96, \"id\": 16}, {\"text\": \"learning\", \"start\": 97, \"end\": 105, \"id\": 17}, {\"text\": \"background\", \"start\": 106, \"end\": 116, \"id\": 18}, {\"text\": \",\", \"start\": 116, \"end\": 117, \"id\": 19}, {\"text\": \"and\", \"start\": 118, \"end\": 121, \"id\": 20}, {\"text\": \"is\", \"start\": 122, \"end\": 124, \"id\": 21}, {\"text\": \"passionate\", \"start\": 125, \"end\": 135, \"id\": 22}, {\"text\": \"about\", \"start\": 136, \"end\": 141, \"id\": 23}, {\"text\": \"developing\", \"start\": 142, \"end\": 152, \"id\": 24}, {\"text\": \"new\", \"start\": 153, \"end\": 156, \"id\": 25}, {\"text\": \"AI\", \"start\": 157, \"end\": 159, \"id\": 26}, {\"text\": \"products\", \"start\": 160, \"end\": 168, \"id\": 27}, {\"text\": \".\", 
\"start\": 168, \"end\": 169, \"id\": 28}], \"spans\": [{\"start\": 21, \"end\": 50, \"token_start\": 5, \"token_end\": 7, \"label\": \"SKILL\"}, {\"start\": 65, \"end\": 71, \"token_start\": 11, \"token_end\": 11, \"label\": \"SKILL\"}, {\"start\": 89, \"end\": 105, \"token_start\": 16, \"token_end\": 17, \"label\": \"SKILL\"}, {\"start\": 142, \"end\": 152, \"token_start\": 24, \"token_end\": 24, \"label\": \"SKILL\"}, {\"start\": 157, \"end\": 159, \"token_start\": 26, \"token_end\": 26, \"label\": \"SKILL\"}]},\n",
"{\"text\": \"Responsibilities As a SOFTWARE DEVELOPMENT ENGINEER II you will work / collaborate with other talented engineers to build features and technologies that will affect millions of your fellow developers in the community.\", \"tokens\": [{\"text\": \"Responsibilities\", \"start\": 0, \"end\": 16, \"id\": 0}, {\"text\": \"As\", \"start\": 17, \"end\": 19, \"id\": 1}, {\"text\": \"a\", \"start\": 20, \"end\": 21, \"id\": 2}, {\"text\": \"SOFTWARE\", \"start\": 22, \"end\": 30, \"id\": 3}, {\"text\": \"DEVELOPMENT\", \"start\": 31, \"end\": 42, \"id\": 4}, {\"text\": \"ENGINEER\", \"start\": 43, \"end\": 51, \"id\": 5}, {\"text\": \"II\", \"start\": 52, \"end\": 54, \"id\": 6}, {\"text\": \"you\", \"start\": 55, \"end\": 58, \"id\": 7}, {\"text\": \"will\", \"start\": 59, \"end\": 63, \"id\": 8}, {\"text\": \"work\", \"start\": 64, \"end\": 68, \"id\": 9}, {\"text\": \"/\", \"start\": 69, \"end\": 70, \"id\": 10}, {\"text\": \"collaborate\", \"start\": 71, \"end\": 82, \"id\": 11}, {\"text\": \"with\", \"start\": 83, \"end\": 87, \"id\": 12}, {\"text\": \"other\", \"start\": 88, \"end\": 93, \"id\": 13}, {\"text\": \"talented\", \"start\": 94, \"end\": 102, \"id\": 14}, {\"text\": \"engineers\", \"start\": 103, \"end\": 112, \"id\": 15}, {\"text\": \"to\", \"start\": 113, \"end\": 115, \"id\": 16}, {\"text\": \"build\", \"start\": 116, \"end\": 121, \"id\": 17}, {\"text\": \"features\", \"start\": 122, \"end\": 130, \"id\": 18}, {\"text\": \"and\", \"start\": 131, \"end\": 134, \"id\": 19}, {\"text\": \"technologies\", \"start\": 135, \"end\": 147, \"id\": 20}, {\"text\": \"that\", \"start\": 148, \"end\": 152, \"id\": 21}, {\"text\": \"will\", \"start\": 153, \"end\": 157, \"id\": 22}, {\"text\": \"affect\", \"start\": 158, \"end\": 164, \"id\": 23}, {\"text\": \"millions\", \"start\": 165, \"end\": 173, \"id\": 24}, {\"text\": \"of\", \"start\": 174, \"end\": 176, \"id\": 25}, {\"text\": \"your\", \"start\": 177, \"end\": 181, \"id\": 26}, {\"text\": 
\"fellow\", \"start\": 182, \"end\": 188, \"id\": 27}, {\"text\": \"developers\", \"start\": 189, \"end\": 199, \"id\": 28}, {\"text\": \"in\", \"start\": 200, \"end\": 202, \"id\": 29}, {\"text\": \"the\", \"start\": 203, \"end\": 206, \"id\": 30}, {\"text\": \"community\", \"start\": 207, \"end\": 216, \"id\": 31}, {\"text\": \".\", \"start\": 216, \"end\": 217, \"id\": 32}], \"spans\": [{\"start\": 22, \"end\": 51, \"token_start\": 3, \"token_end\": 5, \"label\": \"JOB_ROLE\"}, {\"start\": 71, \"end\": 82, \"token_start\": 11, \"token_end\": 11, \"label\": \"SKILL\"}, {\"start\": 103, \"end\": 112, \"token_start\": 15, \"token_end\": 15, \"label\": \"JOB_ROLE\"}, {\"start\": 135, \"end\": 147, \"token_start\": 20, \"token_end\": 20, \"label\": \"SKILL\"}, {\"start\": 189, \"end\": 199, \"token_start\": 28, \"token_end\": 28, \"label\": \"JOB_ROLE\"}]},\n"
]
}
],
"outputs": [],
"source": [
"raw_examples = [\n",
" {\"text\":\"Have you used the new version of my model?\",\"spans\":[{\"start\":36,\"end\":41,\"token_start\":8,\"token_end\":8,\"label\":\"SKILL\"}],\"_input_hash\":1798863398,\"_task_hash\":1273875979,\"tokens\":[{\"text\":\"Have\",\"start\":0,\"end\":4,\"id\":0},{\"text\":\"you\",\"start\":5,\"end\":8,\"id\":1},{\"text\":\"used\",\"start\":9,\"end\":13,\"id\":2},{\"text\":\"the\",\"start\":14,\"end\":17,\"id\":3},{\"text\":\"new\",\"start\":18,\"end\":21,\"id\":4},{\"text\":\"version\",\"start\":22,\"end\":29,\"id\":5},{\"text\":\"of\",\"start\":30,\"end\":32,\"id\":6},{\"text\":\"my\",\"start\":33,\"end\":35,\"id\":7},{\"text\":\"model\",\"start\":36,\"end\":41,\"id\":8},{\"text\":\"?\",\"start\":41,\"end\":42,\"id\":9}],\"_session_id\":None,\"_view_id\":\"ner_manual\",\"answer\":\"accept\"},\n",
@ -62,14 +43,14 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"from pprint import pprint\n",
"from pathlib import Path\n",
"from typing import Dict, List, Set, Tuple\n",
"from typing import Any, Dict, List, Set, Tuple\n",
"import spacy\n",
"import srsly\n",
"# import recon\n",
@ -78,7 +59,7 @@
"from recon.corrections import fix_annotations\n",
"from recon.dataset import Dataset\n",
"from recon.loaders import read_jsonl\n",
"from recon.types import Example, PredictionError, HardestExample, NERStats, EntityCoverageStats, EntityCoverage, Transformation, TransformationType, OperationState\n",
"from recon.types import Correction, Example, PredictionError, HardestExample, NERStats, EntityCoverageStats, EntityCoverage, Transformation, TransformationType, OperationState\n",
"from recon.stats import (\n",
" get_ner_stats, get_entity_coverage, get_sorted_type_counts, get_probs_from_counts, entropy,\n",
" calculate_entity_coverage_entropy, calculate_label_balance_entropy, calculate_label_distribution_similarity,\n",
@ -99,17 +80,24 @@
{
"data": {
"text/plain": [
"OrderedDict([('rename_labels', <recon.operations.Operation at 0x7f2d7478ea58>),\n",
" ('fix_annotations',\n",
" <recon.operations.Operation at 0x7f2d7478ea90>),\n",
" ('strip_annotations',\n",
" <recon.operations.Operation at 0x7f2d7478eac8>),\n",
" ('fix_tokenization_and_spacing',\n",
" <recon.operations.Operation at 0x7f2d26453320>),\n",
" ('add_tokens', <recon.operations.Operation at 0x7f2d26453198>),\n",
" ('upcase_labels', <recon.operations.Operation at 0x7f2d264533c8>),\n",
" ('filter_overlaps',\n",
" <recon.operations.Operation at 0x7f2d26453358>)])"
"OrderedDict([('recon.v1.rename_labels',\n",
" <recon.operations.Operation at 0x7fc69fd2e940>),\n",
" ('recon.v1.fix_annotations',\n",
" <recon.operations.Operation at 0x7fc69fd2e978>),\n",
" ('recon.v1.strip_annotations',\n",
" <recon.operations.Operation at 0x7fc69cf8cc50>),\n",
" ('recon.v1.split_sentences',\n",
" <recon.operations.Operation at 0x7fc6d95a6c18>),\n",
" ('recon.v1.fix_tokenization_and_spacing',\n",
" <recon.operations.Operation at 0x7fc691de3ef0>),\n",
" ('recon.v1.add_tokens',\n",
" <recon.operations.Operation at 0x7fc691de3e48>),\n",
" ('recon.v1.upcase_labels',\n",
" <recon.operations.Operation at 0x7fc691de3f60>),\n",
" ('recon.v1.filter_overlaps',\n",
" <recon.operations.Operation at 0x7fc691de3f98>),\n",
" ('recon.v1.prodigy.merge_examples',\n",
" <recon.operations.Operation at 0x7fc6900ccbe0>)])"
]
},
"execution_count": 4,
@ -157,7 +145,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@ -445,21 +433,9 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'Corpus' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-ca27ac4abad7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcorpus2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCorpus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_disk\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'./fixed_data/skills'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'Corpus' is not defined"
]
}
],
"outputs": [],
"source": [
"corpus2 = Corpus.from_disk('./fixed_data/skills')"
]
@ -504,6 +480,94 @@
"corpus2.apply(get_label_disparities, \"SKILL\", \"JOB_ROLE\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 1. 2. 3. 4. 5.]\n",
" [ 6. 7. 8. 9. 10.]\n",
" [11. 12. 13. 14. 15.]\n",
" [16. 17. 18. 19. 20.]\n",
" [21. 22. 23. 24. 25.]]\n"
]
},
{
"data": {
"text/plain": [
"array([[ True, False, True, False, True],\n",
" [False, True, False, True, False],\n",
" [ True, False, True, False, True],\n",
" [False, True, False, True, False],\n",
" [ True, False, True, False, True]])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
" \n",
"# Create a 5 x 5 ndarray with consecutive integers from 1 to 25 (inclusive).\n",
"# Afterwards use Boolean indexing to pick out only the odd numbers in the array\n",
" \n",
"# Create a 5 x 5 ndarray with consecutive integers from 1 to 25 (inclusive).\n",
"X = np.linspace(1,25,25).reshape(5,5)\n",
"print(X)\n",
" \n",
"# Use Boolean indexing to pick out only the odd numbers in the array\n",
"X * X % 2 == 1\n",
"# Y = X[]\n",
"# print(Y)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Correction(annotation='model', from_label='PRODUCT', to_label='SKILL')]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def corrections_from_dict(corrections_dict: Dict[str, Any]):\n",
" corrections: List[Correction] = []\n",
" for key, val in corrections_dict.items():\n",
" if isinstance(val, str):\n",
" from_label = \"ANY\"\n",
" to_label = val\n",
" elif isinstance(val, tuple):\n",
" from_label = val[0]\n",
" to_label = val[1]\n",
" else:\n",
" raise ValueError(\"Cannot parse corrections dict. Value must be either a str of the label \" +\n",
" \"to change the annotation to (TO_LABEL) or a tuple of (FROM_LABEL, TO_LABEL)\")\n",
" corrections.append(Correction(\n",
" annotation=key,\n",
" from_label=from_label,\n",
" to_label=to_label\n",
" ))\n",
" return corrections\n",
"\n",
"corrections_from_dict({\n",
" \"model\": (\"PRODUCT\", \"SKILL\"),\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 10,
@ -522,6 +586,9 @@
}
],
"source": [
"corrections = [\n",
" Correction(text=\"\")\n",
"]\n",
"corpus2.apply_(fix_annotations, {\"model\": \"SKILL\"})"
]
},

Просмотреть файл

@ -28,8 +28,8 @@ classifiers = [
"License :: OSI Approved :: MIT License"
]
requires = [
"pydantic == 1.4",
"spacy >= 2.2.1, <3.0.0",
"pydantic == 1.5",
"spacy >= 2.3.0, <3.0.0",
"scipy",
"typer >= 0.0.8",
"tqdm"

Просмотреть файл

@ -6,10 +6,11 @@ from typing import Any, DefaultDict, Dict, List
import spacy
from spacy.tokens import Doc as SpacyDoc, Span as SpacySpan
from wasabi import msg
from .operations import operation
from .preprocess import SpacyPreProcessor
from .types import Example, Span, Token, TransformationCallbacks
from .types import Correction, Example, Span, Token, TransformationCallbacks
@operation("recon.v1.rename_labels")
@ -30,7 +31,7 @@ def rename_labels(example: Example, label_map: Dict[str, str]) -> Example:
@operation("recon.v1.fix_annotations")
def fix_annotations(
example: Example, corrections: Dict[str, str], case_sensitive: bool = False
example: Example, corrections: List[Correction], case_sensitive: bool = False, dryrun: bool = False
) -> Example:
"""Fix annotations in a copy of List[Example] data.
@ -43,28 +44,35 @@ def fix_annotations(
corrections (List[Correction]): List of Correction objects mapping annotation text to a new label.
If the to_label of a Correction is set to None, the annotation will be removed
case_sensitive (bool, optional): Consider case of text for each correction
dryrun (bool, optional): Treat corrections as a dryrun and just print all changes to be made
Returns:
Example: Example with fixed annotations
"""
if case_sensitive:
corrections = {t: l for t, l in corrections.items()}
else:
corrections = {t.lower(): l for t, l in corrections.items()}
if not case_sensitive:
for c in corrections:
c.text = c.text.lower()
prints: DefaultDict[str, List[str]] = defaultdict(list)
corrections_map: Dict[str, Correction] = {c.annotation: c for c in corrections}
prints: List[str] = []
ents_to_remove = []
ents_to_remove: List[int] = []
for i, s in enumerate(example.spans):
t = s.text if case_sensitive else s.text.lower()
if t in corrections:
if corrections[t] is print:
prints[t] += [("=" * 100), example.text, s.label]
elif corrections[t] is None:
ents_to_remove.append(i)
else:
s.label = corrections[t]
if t in corrections_map:
c = corrections_map[t]
if c.to_label is None and s.label == c.from_label:
if dryrun:
prints.append(f"Deleting span: {s.text}")
else:
ents_to_remove.append(i)
elif s.label == c.from_label:
if dryrun:
prints.append(f"Correction span: {s.text} from label: {c.from_label} to label: {c.to_label}")
else:
s.label = c.to_label
i = len(ents_to_remove) - 1
while i >= 0:
@ -72,14 +80,55 @@ def fix_annotations(
del example.spans[idx]
i -= 1
for k in sorted(prints):
print(f"**{k}**")
for line in prints[k]:
print(line)
msg.divider("Example Text")
msg.text(example.text)
for line in prints:
msg.text(line)
return example
def corrections_from_dict(corrections_dict: Dict[str, Any]) -> List[Correction]:
    """Create a list of Correction objects from a simpler dict-based config.

    Each key of the dict is the annotation text to correct. Each value is either:
      * a str: the label to convert the annotation to (from any current label), or
      * a tuple of (from_label, to_label) / (List[from_labels], to_label): restrict
        the correction to annotations currently carrying the given label(s).

    e.g. {
        "united states": "GPE",
        "London": (["LOC"], "GPE")
    }

    Args:
        corrections_dict (Dict[str, Any]): Corrections formatted dict

    Raises:
        ValueError: If a value is neither a str (to_label) nor a tuple
            of (from_label(s), to_label)

    Returns:
        List[Correction]: Corrections parsed from the dict
    """
    corrections: List[Correction] = []
    for key, val in corrections_dict.items():
        if isinstance(val, str):
            # No source-label restriction: "ANY" matches any current label.
            # BUG FIX: this branch previously set the singular name `from_label`,
            # leaving `from_labels` (used below) unbound or stale from a prior
            # tuple-valued iteration.
            from_labels = ["ANY"]
            to_label = val
        elif isinstance(val, tuple):
            # Normalize a single from_label str into a one-element list.
            if isinstance(val[0], str):
                from_labels = [val[0]]
            else:
                from_labels = val[0]
            to_label = val[1]
        else:
            raise ValueError(
                "Cannot parse corrections dict. Value must be either a str of the label "
                + "to change the annotation to (TO_LABEL) or a tuple of (FROM_LABEL, TO_LABEL)"
            )
        corrections.append(Correction(
            annotation=key,
            from_labels=from_labels,
            to_label=to_label
        ))
    return corrections
@operation("recon.v1.strip_annotations")
def strip_annotations(
example: Example, strip_chars: List[str] = [".", "!", "?", "-", ":", " "]

Просмотреть файл

@ -193,6 +193,8 @@ class Dataset:
data = loader_func(path)
self.data = data
for example in self.data:
self.example_store.add(example)
if ds_op_state and self.commit_hash != ds_op_state.commit:
# Dataset changed, examples added

Просмотреть файл

@ -23,7 +23,7 @@ from .types import (
)
def get_ents_by_label(data: List[Example], use_lower: bool = True) -> DefaultDict[str, List[str]]:
def get_ents_by_label(data: List[Example], case_sensitive: bool = False) -> DefaultDict[str, List[str]]:
"""Get a dictionary of unique text spans by label for your data
# TODO: Ok so this needs to return more than just a set for each label.
@ -39,7 +39,7 @@ def get_ents_by_label(data: List[Example], use_lower: bool = True) -> DefaultDic
Args:
data (List[Example]): List of examples
use_lower (bool, optional): Use the lowercase form of the span text.
case_sensitive (bool, optional): Consider case of text for each annotation
sort_by (SortBy): Sort by text or by count
Returns:
@ -51,7 +51,7 @@ def get_ents_by_label(data: List[Example], use_lower: bool = True) -> DefaultDic
for e in data:
for s in e.spans:
span_text = s.text.lower() if use_lower else s.text
span_text = s.text if case_sensitive else s.text.lower()
annotations[s.label].add(span_text)
for label, anns in annotations.items():
@ -61,7 +61,7 @@ def get_ents_by_label(data: List[Example], use_lower: bool = True) -> DefaultDic
def get_label_disparities(
data: List[Example], label1: str, label2: str, use_lower: bool = True
data: List[Example], label1: str, label2: str, case_sensitive: bool = False
) -> Set[str]:
"""Identify annotated spans that have different labels in different examples
@ -69,24 +69,24 @@ def get_label_disparities(
data (List[Example]): Input List of examples
label1 (str): First label to compare
label2 (str): Second label to compare
use_lower (bool, optional): Use the lowercase form of the span text in ents_to_label.
case_sensitive (bool, optional): Consider case of text for each annotation
Returns:
Set[str]: Set of all unique text spans that overlap between label1 and label2
"""
annotations = get_ents_by_label(data, use_lower=use_lower)
annotations = get_ents_by_label(data, case_sensitive=case_sensitive)
return set(annotations[label1]).intersection(set(annotations[label2]))
def top_label_disparities(
data: List[Example], use_lower: bool = True, dedupe: bool = False
data: List[Example], case_sensitive: bool = False, dedupe: bool = False
) -> List[LabelDisparity]:
"""Identify annotated spans that have different labels
in different examples for all label pairs in data.
Args:
data (List[Example]): Input List of examples
use_lower (bool, optional): Use the lowercase form of the span text in ents_to_label.
case_sensitive (bool, optional): Consider case of text for each annotation
dedupe (bool, optional): Whether to deduplicate for table view vs confusion matrix.
False by default for easy confusion matrix display.
@ -94,7 +94,7 @@ def top_label_disparities(
List[LabelDisparity]: List of LabelDisparity objects for each label pair combination
sorted by the number of disparities between them.
"""
annotations = get_ents_by_label(data, use_lower=use_lower)
annotations = get_ents_by_label(data, case_sensitive=case_sensitive)
label_disparities = {}
for label1 in annotations.keys():
for label2 in annotations.keys():

Просмотреть файл

@ -112,7 +112,7 @@ def calculate_label_distribution_similarity(x: List[Example], y: List[Example])
def get_entity_coverage(
data: List[Example], sep: str = "||", use_lower: bool = True, return_examples: bool = False,
data: List[Example], sep: str = "||", case_sensitive: bool = False, return_examples: bool = False,
) -> List[EntityCoverage]:
"""Identify how well you dataset covers an entity type. Get insights
on the how many times certain text/label span combinations exist across your
@ -123,7 +123,7 @@ def get_entity_coverage(
data (List[Example]): List of examples
sep (str, optional): Separator used in coverage map, only change if || exists in your text
or label.
use_lower (bool, optional): Use the lowercase form of the span text in ents_to_label.
case_sensitive (bool, optional): Consider case of text for each annotation
return_examples (bool, optional): Return Examples that contain the entity label annotation.
Returns:
@ -135,9 +135,7 @@ def get_entity_coverage(
for example in data:
for span in example.spans:
text = span.text
if use_lower:
text = text.lower()
text = span.text if case_sensitive else span.text.lower()
key = f"{text}{sep}{span.label}"
coverage_map[key] += 1
examples_map[key].append(example)

Просмотреть файл

@ -250,3 +250,11 @@ class Outliers(BaseModel):
low: List[int]
high: List[int]
class Correction(BaseModel):
    """Container for an annotation correction, mapping an annotation
    from one or more source labels to a new label."""

    # The annotation (span text) this correction applies to.
    annotation: str
    # Current label(s) an annotation must carry for the correction to apply
    # (presumably ["ANY"] matches any label — confirm against fix_annotations).
    from_labels: List[str]
    # The label to assign to matching annotations.
    to_label: str