зеркало из https://github.com/microsoft/reconner.git
type corrections in recon.v1.fix_annotations
This commit is contained in:
Родитель
655a8796d9
Коммит
2b783308c4
|
@ -4,15 +4,7 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed',)).History will not be written to the database.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%reload_ext autoreload\n",
|
||||
"%autoreload 2"
|
||||
|
@ -20,20 +12,9 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{\"text\": \"Have you used the new version of my model?\", \"spans\": [{\"start\": 36, \"end\": 41, \"token_start\": 8, \"token_end\": 8, \"label\": \"SKILL\"}], \"tokens\": [{\"text\": \"Have\", \"start\": 0, \"end\": 4, \"id\": 0}, {\"text\": \"you\", \"start\": 5, \"end\": 8, \"id\": 1}, {\"text\": \"used\", \"start\": 9, \"end\": 13, \"id\": 2}, {\"text\": \"the\", \"start\": 14, \"end\": 17, \"id\": 3}, {\"text\": \"new\", \"start\": 18, \"end\": 21, \"id\": 4}, {\"text\": \"version\", \"start\": 22, \"end\": 29, \"id\": 5}, {\"text\": \"of\", \"start\": 30, \"end\": 32, \"id\": 6}, {\"text\": \"my\", \"start\": 33, \"end\": 35, \"id\": 7}, {\"text\": \"model\", \"start\": 36, \"end\": 41, \"id\": 8}, {\"text\": \"?\", \"start\": 41, \"end\": 42, \"id\": 9}]},\n",
|
||||
"{\"text\": \"I'd like to work as an actor or model if possible.\", \"spans\": [{\"text\": \"actor\", \"start\": 23, \"end\": 28, \"token_start\": 7, \"token_end\": 7, \"label\": \"JOB_ROLE\"}, {\"text\": \"model\", \"start\": 32, \"end\": 37, \"token_start\": 9, \"token_end\": 9, \"label\": \"JOB_ROLE\"}], \"tokens\": [{\"text\": \"I\", \"start\": 0, \"end\": 1, \"id\": 0}, {\"text\": \"'d\", \"start\": 1, \"end\": 3, \"id\": 1}, {\"text\": \"like\", \"start\": 4, \"end\": 8, \"id\": 2}, {\"text\": \"to\", \"start\": 9, \"end\": 11, \"id\": 3}, {\"text\": \"work\", \"start\": 12, \"end\": 16, \"id\": 4}, {\"text\": \"as\", \"start\": 17, \"end\": 19, \"id\": 5}, {\"text\": \"an\", \"start\": 20, \"end\": 22, \"id\": 6}, {\"text\": \"actor\", \"start\": 23, \"end\": 28, \"id\": 7}, {\"text\": \"or\", \"start\": 29, \"end\": 31, \"id\": 8}, {\"text\": \"model\", \"start\": 32, \"end\": 37, \"id\": 9}, {\"text\": \"if\", \"start\": 38, \"end\": 40, \"id\": 10}, {\"text\": \"possible\", \"start\": 41, \"end\": 49, \"id\": 11}, {\"text\": \".\", \"start\": 49, \"end\": 50, \"id\": 12}]},\n",
|
||||
"{\"text\": \"We are looking for a Software Development Engineer who has solid coding skills, a strong machine learning background, and is passionate about developing new AI products.\", \"tokens\": [{\"text\": \"We\", \"start\": 0, \"end\": 2, \"id\": 0}, {\"text\": \"are\", \"start\": 3, \"end\": 6, \"id\": 1}, {\"text\": \"looking\", \"start\": 7, \"end\": 14, \"id\": 2}, {\"text\": \"for\", \"start\": 15, \"end\": 18, \"id\": 3}, {\"text\": \"a\", \"start\": 19, \"end\": 20, \"id\": 4}, {\"text\": \"Software\", \"start\": 21, \"end\": 29, \"id\": 5}, {\"text\": \"Development\", \"start\": 30, \"end\": 41, \"id\": 6}, {\"text\": \"Engineer\", \"start\": 42, \"end\": 50, \"id\": 7}, {\"text\": \"who\", \"start\": 51, \"end\": 54, \"id\": 8}, {\"text\": \"has\", \"start\": 55, \"end\": 58, \"id\": 9}, {\"text\": \"solid\", \"start\": 59, \"end\": 64, \"id\": 10}, {\"text\": \"coding\", \"start\": 65, \"end\": 71, \"id\": 11}, {\"text\": \"skills\", \"start\": 72, \"end\": 78, \"id\": 12}, {\"text\": \",\", \"start\": 78, \"end\": 79, \"id\": 13}, {\"text\": \"a\", \"start\": 80, \"end\": 81, \"id\": 14}, {\"text\": \"strong\", \"start\": 82, \"end\": 88, \"id\": 15}, {\"text\": \"machine\", \"start\": 89, \"end\": 96, \"id\": 16}, {\"text\": \"learning\", \"start\": 97, \"end\": 105, \"id\": 17}, {\"text\": \"background\", \"start\": 106, \"end\": 116, \"id\": 18}, {\"text\": \",\", \"start\": 116, \"end\": 117, \"id\": 19}, {\"text\": \"and\", \"start\": 118, \"end\": 121, \"id\": 20}, {\"text\": \"is\", \"start\": 122, \"end\": 124, \"id\": 21}, {\"text\": \"passionate\", \"start\": 125, \"end\": 135, \"id\": 22}, {\"text\": \"about\", \"start\": 136, \"end\": 141, \"id\": 23}, {\"text\": \"developing\", \"start\": 142, \"end\": 152, \"id\": 24}, {\"text\": \"new\", \"start\": 153, \"end\": 156, \"id\": 25}, {\"text\": \"AI\", \"start\": 157, \"end\": 159, \"id\": 26}, {\"text\": \"products\", \"start\": 160, \"end\": 168, \"id\": 27}, {\"text\": \".\", 
\"start\": 168, \"end\": 169, \"id\": 28}], \"spans\": [{\"start\": 21, \"end\": 50, \"token_start\": 5, \"token_end\": 7, \"label\": \"SKILL\"}, {\"start\": 65, \"end\": 71, \"token_start\": 11, \"token_end\": 11, \"label\": \"SKILL\"}, {\"start\": 89, \"end\": 105, \"token_start\": 16, \"token_end\": 17, \"label\": \"SKILL\"}, {\"start\": 142, \"end\": 152, \"token_start\": 24, \"token_end\": 24, \"label\": \"SKILL\"}, {\"start\": 157, \"end\": 159, \"token_start\": 26, \"token_end\": 26, \"label\": \"SKILL\"}]},\n",
|
||||
"{\"text\": \"Responsibilities As a SOFTWARE DEVELOPMENT ENGINEER II you will work / collaborate with other talented engineers to build features and technologies that will affect millions of your fellow developers in the community.\", \"tokens\": [{\"text\": \"Responsibilities\", \"start\": 0, \"end\": 16, \"id\": 0}, {\"text\": \"As\", \"start\": 17, \"end\": 19, \"id\": 1}, {\"text\": \"a\", \"start\": 20, \"end\": 21, \"id\": 2}, {\"text\": \"SOFTWARE\", \"start\": 22, \"end\": 30, \"id\": 3}, {\"text\": \"DEVELOPMENT\", \"start\": 31, \"end\": 42, \"id\": 4}, {\"text\": \"ENGINEER\", \"start\": 43, \"end\": 51, \"id\": 5}, {\"text\": \"II\", \"start\": 52, \"end\": 54, \"id\": 6}, {\"text\": \"you\", \"start\": 55, \"end\": 58, \"id\": 7}, {\"text\": \"will\", \"start\": 59, \"end\": 63, \"id\": 8}, {\"text\": \"work\", \"start\": 64, \"end\": 68, \"id\": 9}, {\"text\": \"/\", \"start\": 69, \"end\": 70, \"id\": 10}, {\"text\": \"collaborate\", \"start\": 71, \"end\": 82, \"id\": 11}, {\"text\": \"with\", \"start\": 83, \"end\": 87, \"id\": 12}, {\"text\": \"other\", \"start\": 88, \"end\": 93, \"id\": 13}, {\"text\": \"talented\", \"start\": 94, \"end\": 102, \"id\": 14}, {\"text\": \"engineers\", \"start\": 103, \"end\": 112, \"id\": 15}, {\"text\": \"to\", \"start\": 113, \"end\": 115, \"id\": 16}, {\"text\": \"build\", \"start\": 116, \"end\": 121, \"id\": 17}, {\"text\": \"features\", \"start\": 122, \"end\": 130, \"id\": 18}, {\"text\": \"and\", \"start\": 131, \"end\": 134, \"id\": 19}, {\"text\": \"technologies\", \"start\": 135, \"end\": 147, \"id\": 20}, {\"text\": \"that\", \"start\": 148, \"end\": 152, \"id\": 21}, {\"text\": \"will\", \"start\": 153, \"end\": 157, \"id\": 22}, {\"text\": \"affect\", \"start\": 158, \"end\": 164, \"id\": 23}, {\"text\": \"millions\", \"start\": 165, \"end\": 173, \"id\": 24}, {\"text\": \"of\", \"start\": 174, \"end\": 176, \"id\": 25}, {\"text\": \"your\", \"start\": 177, \"end\": 181, \"id\": 26}, {\"text\": 
\"fellow\", \"start\": 182, \"end\": 188, \"id\": 27}, {\"text\": \"developers\", \"start\": 189, \"end\": 199, \"id\": 28}, {\"text\": \"in\", \"start\": 200, \"end\": 202, \"id\": 29}, {\"text\": \"the\", \"start\": 203, \"end\": 206, \"id\": 30}, {\"text\": \"community\", \"start\": 207, \"end\": 216, \"id\": 31}, {\"text\": \".\", \"start\": 216, \"end\": 217, \"id\": 32}], \"spans\": [{\"start\": 22, \"end\": 51, \"token_start\": 3, \"token_end\": 5, \"label\": \"JOB_ROLE\"}, {\"start\": 71, \"end\": 82, \"token_start\": 11, \"token_end\": 11, \"label\": \"SKILL\"}, {\"start\": 103, \"end\": 112, \"token_start\": 15, \"token_end\": 15, \"label\": \"JOB_ROLE\"}, {\"start\": 135, \"end\": 147, \"token_start\": 20, \"token_end\": 20, \"label\": \"SKILL\"}, {\"start\": 189, \"end\": 199, \"token_start\": 28, \"token_end\": 28, \"label\": \"JOB_ROLE\"}]},\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"raw_examples = [\n",
|
||||
" {\"text\":\"Have you used the new version of my model?\",\"spans\":[{\"start\":36,\"end\":41,\"token_start\":8,\"token_end\":8,\"label\":\"SKILL\"}],\"_input_hash\":1798863398,\"_task_hash\":1273875979,\"tokens\":[{\"text\":\"Have\",\"start\":0,\"end\":4,\"id\":0},{\"text\":\"you\",\"start\":5,\"end\":8,\"id\":1},{\"text\":\"used\",\"start\":9,\"end\":13,\"id\":2},{\"text\":\"the\",\"start\":14,\"end\":17,\"id\":3},{\"text\":\"new\",\"start\":18,\"end\":21,\"id\":4},{\"text\":\"version\",\"start\":22,\"end\":29,\"id\":5},{\"text\":\"of\",\"start\":30,\"end\":32,\"id\":6},{\"text\":\"my\",\"start\":33,\"end\":35,\"id\":7},{\"text\":\"model\",\"start\":36,\"end\":41,\"id\":8},{\"text\":\"?\",\"start\":41,\"end\":42,\"id\":9}],\"_session_id\":None,\"_view_id\":\"ner_manual\",\"answer\":\"accept\"},\n",
|
||||
|
@ -62,14 +43,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import copy\n",
|
||||
"from pprint import pprint\n",
|
||||
"from pathlib import Path\n",
|
||||
"from typing import Dict, List, Set, Tuple\n",
|
||||
"from typing import Any, Dict, List, Set, Tuple\n",
|
||||
"import spacy\n",
|
||||
"import srsly\n",
|
||||
"# import recon\n",
|
||||
|
@ -78,7 +59,7 @@
|
|||
"from recon.corrections import fix_annotations\n",
|
||||
"from recon.dataset import Dataset\n",
|
||||
"from recon.loaders import read_jsonl\n",
|
||||
"from recon.types import Example, PredictionError, HardestExample, NERStats, EntityCoverageStats, EntityCoverage, Transformation, TransformationType, OperationState\n",
|
||||
"from recon.types import Correction, Example, PredictionError, HardestExample, NERStats, EntityCoverageStats, EntityCoverage, Transformation, TransformationType, OperationState\n",
|
||||
"from recon.stats import (\n",
|
||||
" get_ner_stats, get_entity_coverage, get_sorted_type_counts, get_probs_from_counts, entropy,\n",
|
||||
" calculate_entity_coverage_entropy, calculate_label_balance_entropy, calculate_label_distribution_similarity,\n",
|
||||
|
@ -99,17 +80,24 @@
|
|||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"OrderedDict([('rename_labels', <recon.operations.Operation at 0x7f2d7478ea58>),\n",
|
||||
" ('fix_annotations',\n",
|
||||
" <recon.operations.Operation at 0x7f2d7478ea90>),\n",
|
||||
" ('strip_annotations',\n",
|
||||
" <recon.operations.Operation at 0x7f2d7478eac8>),\n",
|
||||
" ('fix_tokenization_and_spacing',\n",
|
||||
" <recon.operations.Operation at 0x7f2d26453320>),\n",
|
||||
" ('add_tokens', <recon.operations.Operation at 0x7f2d26453198>),\n",
|
||||
" ('upcase_labels', <recon.operations.Operation at 0x7f2d264533c8>),\n",
|
||||
" ('filter_overlaps',\n",
|
||||
" <recon.operations.Operation at 0x7f2d26453358>)])"
|
||||
"OrderedDict([('recon.v1.rename_labels',\n",
|
||||
" <recon.operations.Operation at 0x7fc69fd2e940>),\n",
|
||||
" ('recon.v1.fix_annotations',\n",
|
||||
" <recon.operations.Operation at 0x7fc69fd2e978>),\n",
|
||||
" ('recon.v1.strip_annotations',\n",
|
||||
" <recon.operations.Operation at 0x7fc69cf8cc50>),\n",
|
||||
" ('recon.v1.split_sentences',\n",
|
||||
" <recon.operations.Operation at 0x7fc6d95a6c18>),\n",
|
||||
" ('recon.v1.fix_tokenization_and_spacing',\n",
|
||||
" <recon.operations.Operation at 0x7fc691de3ef0>),\n",
|
||||
" ('recon.v1.add_tokens',\n",
|
||||
" <recon.operations.Operation at 0x7fc691de3e48>),\n",
|
||||
" ('recon.v1.upcase_labels',\n",
|
||||
" <recon.operations.Operation at 0x7fc691de3f60>),\n",
|
||||
" ('recon.v1.filter_overlaps',\n",
|
||||
" <recon.operations.Operation at 0x7fc691de3f98>),\n",
|
||||
" ('recon.v1.prodigy.merge_examples',\n",
|
||||
" <recon.operations.Operation at 0x7fc6900ccbe0>)])"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
|
@ -157,7 +145,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -445,21 +433,9 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'Corpus' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-11-ca27ac4abad7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcorpus2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCorpus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_disk\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'./fixed_data/skills'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'Corpus' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"corpus2 = Corpus.from_disk('./fixed_data/skills')"
|
||||
]
|
||||
|
@ -504,6 +480,94 @@
|
|||
"corpus2.apply(get_label_disparities, \"SKILL\", \"JOB_ROLE\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[ 1. 2. 3. 4. 5.]\n",
|
||||
" [ 6. 7. 8. 9. 10.]\n",
|
||||
" [11. 12. 13. 14. 15.]\n",
|
||||
" [16. 17. 18. 19. 20.]\n",
|
||||
" [21. 22. 23. 24. 25.]]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([[ True, False, True, False, True],\n",
|
||||
" [False, True, False, True, False],\n",
|
||||
" [ True, False, True, False, True],\n",
|
||||
" [False, True, False, True, False],\n",
|
||||
" [ True, False, True, False, True]])"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
" \n",
|
||||
"# Create a 5 x 5 ndarray with consecutive integers from 1 to 25 (inclusive).\n",
|
||||
"# Afterwards use Boolean indexing to pick out only the odd numbers in the array\n",
|
||||
" \n",
|
||||
"# Create a 5 x 5 ndarray with consecutive integers from 1 to 25 (inclusive).\n",
|
||||
"X = np.linspace(1,25,25).reshape(5,5)\n",
|
||||
"print(X)\n",
|
||||
" \n",
|
||||
"# Use Boolean indexing to pick out only the odd numbers in the array\n",
|
||||
"X * X % 2 == 1\n",
|
||||
"# Y = X[]\n",
|
||||
"# print(Y)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Correction(annotation='model', from_label='PRODUCT', to_label='SKILL')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def corrections_from_dict(corrections_dict: Dict[str, Any]):\n",
|
||||
" corrections: List[Correction] = []\n",
|
||||
" for key, val in corrections_dict.items():\n",
|
||||
" if isinstance(val, str):\n",
|
||||
" from_label = \"ANY\"\n",
|
||||
" to_label = val\n",
|
||||
" elif isinstance(val, tuple):\n",
|
||||
" from_label = val[0]\n",
|
||||
" to_label = val[1]\n",
|
||||
" else:\n",
|
||||
" raise ValueError(\"Cannot parse corrections dict. Value must be either a str of the label \" +\n",
|
||||
" \"to change the annotation to (TO_LABEL) or a tuple of (FROM_LABEL, TO_LABEL)\")\n",
|
||||
" corrections.append(Correction(\n",
|
||||
" annotation=key,\n",
|
||||
" from_label=from_label,\n",
|
||||
" to_label=to_label\n",
|
||||
" ))\n",
|
||||
" return corrections\n",
|
||||
"\n",
|
||||
"corrections_from_dict({\n",
|
||||
" \"model\": (\"PRODUCT\", \"SKILL\"),\n",
|
||||
"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
|
@ -522,6 +586,9 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"corrections = [\n",
|
||||
" Correction(text=\"\")\n",
|
||||
"]\n",
|
||||
"corpus2.apply_(fix_annotations, {\"model\": \"SKILL\"})"
|
||||
]
|
||||
},
|
||||
|
|
|
@ -28,8 +28,8 @@ classifiers = [
|
|||
"License :: OSI Approved :: MIT License"
|
||||
]
|
||||
requires = [
|
||||
"pydantic == 1.4",
|
||||
"spacy >= 2.2.1, <3.0.0",
|
||||
"pydantic == 1.5",
|
||||
"spacy >= 2.3.0, <3.0.0",
|
||||
"scipy",
|
||||
"typer >= 0.0.8",
|
||||
"tqdm"
|
||||
|
|
|
@ -6,10 +6,11 @@ from typing import Any, DefaultDict, Dict, List
|
|||
|
||||
import spacy
|
||||
from spacy.tokens import Doc as SpacyDoc, Span as SpacySpan
|
||||
from wasabi import msg
|
||||
|
||||
from .operations import operation
|
||||
from .preprocess import SpacyPreProcessor
|
||||
from .types import Example, Span, Token, TransformationCallbacks
|
||||
from .types import Correction, Example, Span, Token, TransformationCallbacks
|
||||
|
||||
|
||||
@operation("recon.v1.rename_labels")
|
||||
|
@ -30,7 +31,7 @@ def rename_labels(example: Example, label_map: Dict[str, str]) -> Example:
|
|||
|
||||
@operation("recon.v1.fix_annotations")
|
||||
def fix_annotations(
|
||||
example: Example, corrections: Dict[str, str], case_sensitive: bool = False
|
||||
example: Example, corrections: List[Correction], case_sensitive: bool = False, dryrun: bool = False
|
||||
) -> Example:
|
||||
"""Fix annotations in a copy of List[Example] data.
|
||||
|
||||
|
@ -43,28 +44,35 @@ def fix_annotations(
|
|||
corrections (Dict[str, str]): Dictionary of corrections mapping entity text to a new label.
|
||||
If the value is set to None, the annotation will be removed
|
||||
case_sensitive (bool, optional): Consider case of text for each correction
|
||||
dryrun (bool, optional): Treat corrections as a dryrun and just print all changes to be made
|
||||
|
||||
Returns:
|
||||
Example: Example with fixed annotations
|
||||
"""
|
||||
if case_sensitive:
|
||||
corrections = {t: l for t, l in corrections.items()}
|
||||
else:
|
||||
corrections = {t.lower(): l for t, l in corrections.items()}
|
||||
|
||||
if not case_sensitive:
|
||||
for c in corrections:
|
||||
c.text = c.text.lower()
|
||||
|
||||
prints: DefaultDict[str, List[str]] = defaultdict(list)
|
||||
corrections_map: Dict[str, Correction] = {c.annotation: c for c in corrections}
|
||||
prints: List[str] = []
|
||||
|
||||
ents_to_remove = []
|
||||
ents_to_remove: List[int] = []
|
||||
for i, s in enumerate(example.spans):
|
||||
t = s.text if case_sensitive else s.text.lower()
|
||||
|
||||
if t in corrections:
|
||||
if corrections[t] is print:
|
||||
prints[t] += [("=" * 100), example.text, s.label]
|
||||
elif corrections[t] is None:
|
||||
ents_to_remove.append(i)
|
||||
else:
|
||||
s.label = corrections[t]
|
||||
if t in corrections_map:
|
||||
c = corrections_map[t]
|
||||
if c.to_label is None and s.label == c.from_label:
|
||||
if dryrun:
|
||||
prints.append(f"Deleting span: {s.text}")
|
||||
else:
|
||||
ents_to_remove.append(i)
|
||||
elif s.label == c.from_label:
|
||||
if dryrun:
|
||||
prints.append(f"Correction span: {s.text} from label: {c.from_label} to label: {c.to_label}")
|
||||
else:
|
||||
s.label = c.to_label
|
||||
|
||||
i = len(ents_to_remove) - 1
|
||||
while i >= 0:
|
||||
|
@ -72,14 +80,55 @@ def fix_annotations(
|
|||
del example.spans[idx]
|
||||
i -= 1
|
||||
|
||||
for k in sorted(prints):
|
||||
print(f"**{k}**")
|
||||
for line in prints[k]:
|
||||
print(line)
|
||||
msg.divider("Example Text")
|
||||
msg.text(example.text)
|
||||
for line in prints:
|
||||
msg.text(line)
|
||||
|
||||
return example
|
||||
|
||||
|
||||
def corrections_from_dict(corrections_dict: Dict[str, Any]) -> List[Correction]:
    """Create a list of Correction objects from a simpler Dict based config.

    Each key is the annotation text to correct. Each value is either the
    label to convert to, a tuple of (from_label, to_label), or a tuple of
    (List[from_labels], to_label) if you want to convert a subset of labels
    at a time.

    Args:
        corrections_dict (Dict[str, Any]): Corrections formatted dict
            e.g. {
                "united states": "GPE",
                "London": (["LOC"], "GPE")
            }

    Raises:
        ValueError: If a value in the dict is neither a str nor a tuple

    Returns:
        List[Correction]: One Correction per entry of corrections_dict,
            in the iteration order of the dict
    """
    corrections: List[Correction] = []
    for key, val in corrections_dict.items():
        if isinstance(val, str):
            # A bare label carries no source label restriction. Bind the
            # plural `from_labels` name used below; the singular name that was
            # bound here before left `from_labels` undefined (or stale from
            # the previous entry) for plain-string values.
            # NOTE(review): "ANY" is a placeholder; confirm how consumers of
            # Correction interpret it.
            from_labels = ["ANY"]
            to_label = val
        elif isinstance(val, tuple):
            # Normalize a single source label to a one-element list so that
            # from_labels always has the same shape.
            if isinstance(val[0], str):
                from_labels = [val[0]]
            else:
                from_labels = val[0]
            to_label = val[1]
        else:
            raise ValueError("Cannot parse corrections dict. Value must be either a str of the label " +
                             "to change the annotation to (TO_LABEL) or a tuple of (FROM_LABEL, TO_LABEL)")
        corrections.append(Correction(
            annotation=key,
            from_labels=from_labels,
            to_label=to_label
        ))
    return corrections
|
||||
|
||||
|
||||
@operation("recon.v1.strip_annotations")
|
||||
def strip_annotations(
|
||||
example: Example, strip_chars: List[str] = [".", "!", "?", "-", ":", " "]
|
||||
|
|
|
@ -193,6 +193,8 @@ class Dataset:
|
|||
|
||||
data = loader_func(path)
|
||||
self.data = data
|
||||
for example in self.data:
|
||||
self.example_store.add(example)
|
||||
|
||||
if ds_op_state and self.commit_hash != ds_op_state.commit:
|
||||
# Dataset changed, examples added
|
||||
|
|
|
@ -23,7 +23,7 @@ from .types import (
|
|||
)
|
||||
|
||||
|
||||
def get_ents_by_label(data: List[Example], use_lower: bool = True) -> DefaultDict[str, List[str]]:
|
||||
def get_ents_by_label(data: List[Example], case_sensitive: bool = False) -> DefaultDict[str, List[str]]:
|
||||
"""Get a dictionary of unique text spans by label for your data
|
||||
|
||||
# TODO: Ok so this needs to return more than just a set for each label.
|
||||
|
@ -39,7 +39,7 @@ def get_ents_by_label(data: List[Example], use_lower: bool = True) -> DefaultDic
|
|||
|
||||
Args:
|
||||
data (List[Example]): List of examples
|
||||
use_lower (bool, optional): Use the lowercase form of the span text.
|
||||
case_sensitive (bool, optional): Consider case of text for each annotation
|
||||
sort_by (SortBy): Sort by text or by count
|
||||
|
||||
Returns:
|
||||
|
@ -51,7 +51,7 @@ def get_ents_by_label(data: List[Example], use_lower: bool = True) -> DefaultDic
|
|||
|
||||
for e in data:
|
||||
for s in e.spans:
|
||||
span_text = s.text.lower() if use_lower else s.text
|
||||
span_text = s.text if case_sensitive else s.text.lower()
|
||||
annotations[s.label].add(span_text)
|
||||
|
||||
for label, anns in annotations.items():
|
||||
|
@ -61,7 +61,7 @@ def get_ents_by_label(data: List[Example], use_lower: bool = True) -> DefaultDic
|
|||
|
||||
|
||||
def get_label_disparities(
|
||||
data: List[Example], label1: str, label2: str, use_lower: bool = True
|
||||
data: List[Example], label1: str, label2: str, case_sensitive: bool = False
|
||||
) -> Set[str]:
|
||||
"""Identify annotated spans that have different labels in different examples
|
||||
|
||||
|
@ -69,24 +69,24 @@ def get_label_disparities(
|
|||
data (List[Example]): Input List of examples
|
||||
label1 (str): First label to compare
|
||||
label2 (str): Second label to compare
|
||||
use_lower (bool, optional): Use the lowercase form of the span text in ents_to_label.
|
||||
case_sensitive (bool, optional): Consider case of text for each annotation
|
||||
|
||||
Returns:
|
||||
Set[str]: Set of all unique text spans that overlap between label1 and label2
|
||||
"""
|
||||
annotations = get_ents_by_label(data, use_lower=use_lower)
|
||||
annotations = get_ents_by_label(data, case_sensitive=case_sensitive)
|
||||
return set(annotations[label1]).intersection(set(annotations[label2]))
|
||||
|
||||
|
||||
def top_label_disparities(
|
||||
data: List[Example], use_lower: bool = True, dedupe: bool = False
|
||||
data: List[Example], case_sensitive: bool = False, dedupe: bool = False
|
||||
) -> List[LabelDisparity]:
|
||||
"""Identify annotated spans that have different labels
|
||||
in different examples for all label pairs in data.
|
||||
|
||||
Args:
|
||||
data (List[Example]): Input List of examples
|
||||
use_lower (bool, optional): Use the lowercase form of the span text in ents_to_label.
|
||||
case_sensitive (bool, optional): Consider case of text for each annotation
|
||||
dedupe (bool, optional): Whether to deduplicate for table view vs confusion matrix.
|
||||
False by default for easy confusion matrix display.
|
||||
|
||||
|
@ -94,7 +94,7 @@ def top_label_disparities(
|
|||
List[LabelDisparity]: List of LabelDisparity objects for each label pair combination
|
||||
sorted by the number of disparities between them.
|
||||
"""
|
||||
annotations = get_ents_by_label(data, use_lower=use_lower)
|
||||
annotations = get_ents_by_label(data, case_sensitive=case_sensitive)
|
||||
label_disparities = {}
|
||||
for label1 in annotations.keys():
|
||||
for label2 in annotations.keys():
|
||||
|
|
|
@ -112,7 +112,7 @@ def calculate_label_distribution_similarity(x: List[Example], y: List[Example])
|
|||
|
||||
|
||||
def get_entity_coverage(
|
||||
data: List[Example], sep: str = "||", use_lower: bool = True, return_examples: bool = False,
|
||||
data: List[Example], sep: str = "||", case_sensitive: bool = False, return_examples: bool = False,
|
||||
) -> List[EntityCoverage]:
|
||||
"""Identify how well you dataset covers an entity type. Get insights
|
||||
on the how many times certain text/label span combinations exist across your
|
||||
|
@ -123,7 +123,7 @@ def get_entity_coverage(
|
|||
data (List[Example]): List of examples
|
||||
sep (str, optional): Separator used in coverage map, only change if || exists in your text
|
||||
or label.
|
||||
use_lower (bool, optional): Use the lowercase form of the span text in ents_to_label.
|
||||
case_sensitive (bool, optional): Consider case of text for each annotation
|
||||
return_examples (bool, optional): Return Examples that contain the entity label annotation.
|
||||
|
||||
Returns:
|
||||
|
@ -135,9 +135,7 @@ def get_entity_coverage(
|
|||
|
||||
for example in data:
|
||||
for span in example.spans:
|
||||
text = span.text
|
||||
if use_lower:
|
||||
text = text.lower()
|
||||
text = span.text if case_sensitive else span.text.lower()
|
||||
key = f"{text}{sep}{span.label}"
|
||||
coverage_map[key] += 1
|
||||
examples_map[key].append(example)
|
||||
|
|
|
@ -250,3 +250,11 @@ class Outliers(BaseModel):
|
|||
|
||||
low: List[int]
|
||||
high: List[int]
|
||||
|
||||
|
||||
class Correction(BaseModel):
    """Container for an annotation correction, mapping an annotation from a set of labels to a label"""

    # Annotation text the correction applies to.
    annotation: str
    # Source labels of the annotation that this correction should apply to.
    from_labels: List[str]
    # Label the annotation is changed to.
    # NOTE(review): fix_annotations checks `to_label is None` to delete a span,
    # but this field is declared as a required str - confirm whether it
    # should be Optional[str].
    to_label: str
|
||||
|
|
Загрузка…
Ссылка в новой задаче