type corrections in recon.v1.fix_annotations

This commit is contained in:
Kabir Khan 2020-06-25 13:25:41 -07:00
Родитель 655a8796d9
Коммит 2b783308c4
7 изменённых файлов: 210 добавлений и 86 удалений

Просмотреть файл

@ -4,15 +4,7 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed',)).History will not be written to the database.\n"
]
}
],
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2"
@ -20,20 +12,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"text\": \"Have you used the new version of my model?\", \"spans\": [{\"start\": 36, \"end\": 41, \"token_start\": 8, \"token_end\": 8, \"label\": \"SKILL\"}], \"tokens\": [{\"text\": \"Have\", \"start\": 0, \"end\": 4, \"id\": 0}, {\"text\": \"you\", \"start\": 5, \"end\": 8, \"id\": 1}, {\"text\": \"used\", \"start\": 9, \"end\": 13, \"id\": 2}, {\"text\": \"the\", \"start\": 14, \"end\": 17, \"id\": 3}, {\"text\": \"new\", \"start\": 18, \"end\": 21, \"id\": 4}, {\"text\": \"version\", \"start\": 22, \"end\": 29, \"id\": 5}, {\"text\": \"of\", \"start\": 30, \"end\": 32, \"id\": 6}, {\"text\": \"my\", \"start\": 33, \"end\": 35, \"id\": 7}, {\"text\": \"model\", \"start\": 36, \"end\": 41, \"id\": 8}, {\"text\": \"?\", \"start\": 41, \"end\": 42, \"id\": 9}]},\n",
"{\"text\": \"I'd like to work as an actor or model if possible.\", \"spans\": [{\"text\": \"actor\", \"start\": 23, \"end\": 28, \"token_start\": 7, \"token_end\": 7, \"label\": \"JOB_ROLE\"}, {\"text\": \"model\", \"start\": 32, \"end\": 37, \"token_start\": 9, \"token_end\": 9, \"label\": \"JOB_ROLE\"}], \"tokens\": [{\"text\": \"I\", \"start\": 0, \"end\": 1, \"id\": 0}, {\"text\": \"'d\", \"start\": 1, \"end\": 3, \"id\": 1}, {\"text\": \"like\", \"start\": 4, \"end\": 8, \"id\": 2}, {\"text\": \"to\", \"start\": 9, \"end\": 11, \"id\": 3}, {\"text\": \"work\", \"start\": 12, \"end\": 16, \"id\": 4}, {\"text\": \"as\", \"start\": 17, \"end\": 19, \"id\": 5}, {\"text\": \"an\", \"start\": 20, \"end\": 22, \"id\": 6}, {\"text\": \"actor\", \"start\": 23, \"end\": 28, \"id\": 7}, {\"text\": \"or\", \"start\": 29, \"end\": 31, \"id\": 8}, {\"text\": \"model\", \"start\": 32, \"end\": 37, \"id\": 9}, {\"text\": \"if\", \"start\": 38, \"end\": 40, \"id\": 10}, {\"text\": \"possible\", \"start\": 41, \"end\": 49, \"id\": 11}, {\"text\": \".\", \"start\": 49, \"end\": 50, \"id\": 12}]},\n",
"{\"text\": \"We are looking for a Software Development Engineer who has solid coding skills, a strong machine learning background, and is passionate about developing new AI products.\", \"tokens\": [{\"text\": \"We\", \"start\": 0, \"end\": 2, \"id\": 0}, {\"text\": \"are\", \"start\": 3, \"end\": 6, \"id\": 1}, {\"text\": \"looking\", \"start\": 7, \"end\": 14, \"id\": 2}, {\"text\": \"for\", \"start\": 15, \"end\": 18, \"id\": 3}, {\"text\": \"a\", \"start\": 19, \"end\": 20, \"id\": 4}, {\"text\": \"Software\", \"start\": 21, \"end\": 29, \"id\": 5}, {\"text\": \"Development\", \"start\": 30, \"end\": 41, \"id\": 6}, {\"text\": \"Engineer\", \"start\": 42, \"end\": 50, \"id\": 7}, {\"text\": \"who\", \"start\": 51, \"end\": 54, \"id\": 8}, {\"text\": \"has\", \"start\": 55, \"end\": 58, \"id\": 9}, {\"text\": \"solid\", \"start\": 59, \"end\": 64, \"id\": 10}, {\"text\": \"coding\", \"start\": 65, \"end\": 71, \"id\": 11}, {\"text\": \"skills\", \"start\": 72, \"end\": 78, \"id\": 12}, {\"text\": \",\", \"start\": 78, \"end\": 79, \"id\": 13}, {\"text\": \"a\", \"start\": 80, \"end\": 81, \"id\": 14}, {\"text\": \"strong\", \"start\": 82, \"end\": 88, \"id\": 15}, {\"text\": \"machine\", \"start\": 89, \"end\": 96, \"id\": 16}, {\"text\": \"learning\", \"start\": 97, \"end\": 105, \"id\": 17}, {\"text\": \"background\", \"start\": 106, \"end\": 116, \"id\": 18}, {\"text\": \",\", \"start\": 116, \"end\": 117, \"id\": 19}, {\"text\": \"and\", \"start\": 118, \"end\": 121, \"id\": 20}, {\"text\": \"is\", \"start\": 122, \"end\": 124, \"id\": 21}, {\"text\": \"passionate\", \"start\": 125, \"end\": 135, \"id\": 22}, {\"text\": \"about\", \"start\": 136, \"end\": 141, \"id\": 23}, {\"text\": \"developing\", \"start\": 142, \"end\": 152, \"id\": 24}, {\"text\": \"new\", \"start\": 153, \"end\": 156, \"id\": 25}, {\"text\": \"AI\", \"start\": 157, \"end\": 159, \"id\": 26}, {\"text\": \"products\", \"start\": 160, \"end\": 168, \"id\": 27}, {\"text\": \".\", 
\"start\": 168, \"end\": 169, \"id\": 28}], \"spans\": [{\"start\": 21, \"end\": 50, \"token_start\": 5, \"token_end\": 7, \"label\": \"SKILL\"}, {\"start\": 65, \"end\": 71, \"token_start\": 11, \"token_end\": 11, \"label\": \"SKILL\"}, {\"start\": 89, \"end\": 105, \"token_start\": 16, \"token_end\": 17, \"label\": \"SKILL\"}, {\"start\": 142, \"end\": 152, \"token_start\": 24, \"token_end\": 24, \"label\": \"SKILL\"}, {\"start\": 157, \"end\": 159, \"token_start\": 26, \"token_end\": 26, \"label\": \"SKILL\"}]},\n",
"{\"text\": \"Responsibilities As a SOFTWARE DEVELOPMENT ENGINEER II you will work / collaborate with other talented engineers to build features and technologies that will affect millions of your fellow developers in the community.\", \"tokens\": [{\"text\": \"Responsibilities\", \"start\": 0, \"end\": 16, \"id\": 0}, {\"text\": \"As\", \"start\": 17, \"end\": 19, \"id\": 1}, {\"text\": \"a\", \"start\": 20, \"end\": 21, \"id\": 2}, {\"text\": \"SOFTWARE\", \"start\": 22, \"end\": 30, \"id\": 3}, {\"text\": \"DEVELOPMENT\", \"start\": 31, \"end\": 42, \"id\": 4}, {\"text\": \"ENGINEER\", \"start\": 43, \"end\": 51, \"id\": 5}, {\"text\": \"II\", \"start\": 52, \"end\": 54, \"id\": 6}, {\"text\": \"you\", \"start\": 55, \"end\": 58, \"id\": 7}, {\"text\": \"will\", \"start\": 59, \"end\": 63, \"id\": 8}, {\"text\": \"work\", \"start\": 64, \"end\": 68, \"id\": 9}, {\"text\": \"/\", \"start\": 69, \"end\": 70, \"id\": 10}, {\"text\": \"collaborate\", \"start\": 71, \"end\": 82, \"id\": 11}, {\"text\": \"with\", \"start\": 83, \"end\": 87, \"id\": 12}, {\"text\": \"other\", \"start\": 88, \"end\": 93, \"id\": 13}, {\"text\": \"talented\", \"start\": 94, \"end\": 102, \"id\": 14}, {\"text\": \"engineers\", \"start\": 103, \"end\": 112, \"id\": 15}, {\"text\": \"to\", \"start\": 113, \"end\": 115, \"id\": 16}, {\"text\": \"build\", \"start\": 116, \"end\": 121, \"id\": 17}, {\"text\": \"features\", \"start\": 122, \"end\": 130, \"id\": 18}, {\"text\": \"and\", \"start\": 131, \"end\": 134, \"id\": 19}, {\"text\": \"technologies\", \"start\": 135, \"end\": 147, \"id\": 20}, {\"text\": \"that\", \"start\": 148, \"end\": 152, \"id\": 21}, {\"text\": \"will\", \"start\": 153, \"end\": 157, \"id\": 22}, {\"text\": \"affect\", \"start\": 158, \"end\": 164, \"id\": 23}, {\"text\": \"millions\", \"start\": 165, \"end\": 173, \"id\": 24}, {\"text\": \"of\", \"start\": 174, \"end\": 176, \"id\": 25}, {\"text\": \"your\", \"start\": 177, \"end\": 181, \"id\": 26}, {\"text\": 
\"fellow\", \"start\": 182, \"end\": 188, \"id\": 27}, {\"text\": \"developers\", \"start\": 189, \"end\": 199, \"id\": 28}, {\"text\": \"in\", \"start\": 200, \"end\": 202, \"id\": 29}, {\"text\": \"the\", \"start\": 203, \"end\": 206, \"id\": 30}, {\"text\": \"community\", \"start\": 207, \"end\": 216, \"id\": 31}, {\"text\": \".\", \"start\": 216, \"end\": 217, \"id\": 32}], \"spans\": [{\"start\": 22, \"end\": 51, \"token_start\": 3, \"token_end\": 5, \"label\": \"JOB_ROLE\"}, {\"start\": 71, \"end\": 82, \"token_start\": 11, \"token_end\": 11, \"label\": \"SKILL\"}, {\"start\": 103, \"end\": 112, \"token_start\": 15, \"token_end\": 15, \"label\": \"JOB_ROLE\"}, {\"start\": 135, \"end\": 147, \"token_start\": 20, \"token_end\": 20, \"label\": \"SKILL\"}, {\"start\": 189, \"end\": 199, \"token_start\": 28, \"token_end\": 28, \"label\": \"JOB_ROLE\"}]},\n"
]
}
],
"outputs": [],
"source": [
"raw_examples = [\n",
" {\"text\":\"Have you used the new version of my model?\",\"spans\":[{\"start\":36,\"end\":41,\"token_start\":8,\"token_end\":8,\"label\":\"SKILL\"}],\"_input_hash\":1798863398,\"_task_hash\":1273875979,\"tokens\":[{\"text\":\"Have\",\"start\":0,\"end\":4,\"id\":0},{\"text\":\"you\",\"start\":5,\"end\":8,\"id\":1},{\"text\":\"used\",\"start\":9,\"end\":13,\"id\":2},{\"text\":\"the\",\"start\":14,\"end\":17,\"id\":3},{\"text\":\"new\",\"start\":18,\"end\":21,\"id\":4},{\"text\":\"version\",\"start\":22,\"end\":29,\"id\":5},{\"text\":\"of\",\"start\":30,\"end\":32,\"id\":6},{\"text\":\"my\",\"start\":33,\"end\":35,\"id\":7},{\"text\":\"model\",\"start\":36,\"end\":41,\"id\":8},{\"text\":\"?\",\"start\":41,\"end\":42,\"id\":9}],\"_session_id\":None,\"_view_id\":\"ner_manual\",\"answer\":\"accept\"},\n",
@ -62,14 +43,14 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"from pprint import pprint\n",
"from pathlib import Path\n",
"from typing import Dict, List, Set, Tuple\n",
"from typing import Any, Dict, List, Set, Tuple\n",
"import spacy\n",
"import srsly\n",
"# import recon\n",
@ -78,7 +59,7 @@
"from recon.corrections import fix_annotations\n",
"from recon.dataset import Dataset\n",
"from recon.loaders import read_jsonl\n",
"from recon.types import Example, PredictionError, HardestExample, NERStats, EntityCoverageStats, EntityCoverage, Transformation, TransformationType, OperationState\n",
"from recon.types import Correction, Example, PredictionError, HardestExample, NERStats, EntityCoverageStats, EntityCoverage, Transformation, TransformationType, OperationState\n",
"from recon.stats import (\n",
" get_ner_stats, get_entity_coverage, get_sorted_type_counts, get_probs_from_counts, entropy,\n",
" calculate_entity_coverage_entropy, calculate_label_balance_entropy, calculate_label_distribution_similarity,\n",
@ -99,17 +80,24 @@
{
"data": {
"text/plain": [
"OrderedDict([('rename_labels', <recon.operations.Operation at 0x7f2d7478ea58>),\n",
" ('fix_annotations',\n",
" <recon.operations.Operation at 0x7f2d7478ea90>),\n",
" ('strip_annotations',\n",
" <recon.operations.Operation at 0x7f2d7478eac8>),\n",
" ('fix_tokenization_and_spacing',\n",
" <recon.operations.Operation at 0x7f2d26453320>),\n",
" ('add_tokens', <recon.operations.Operation at 0x7f2d26453198>),\n",
" ('upcase_labels', <recon.operations.Operation at 0x7f2d264533c8>),\n",
" ('filter_overlaps',\n",
" <recon.operations.Operation at 0x7f2d26453358>)])"
"OrderedDict([('recon.v1.rename_labels',\n",
" <recon.operations.Operation at 0x7fc69fd2e940>),\n",
" ('recon.v1.fix_annotations',\n",
" <recon.operations.Operation at 0x7fc69fd2e978>),\n",
" ('recon.v1.strip_annotations',\n",
" <recon.operations.Operation at 0x7fc69cf8cc50>),\n",
" ('recon.v1.split_sentences',\n",
" <recon.operations.Operation at 0x7fc6d95a6c18>),\n",
" ('recon.v1.fix_tokenization_and_spacing',\n",
" <recon.operations.Operation at 0x7fc691de3ef0>),\n",
" ('recon.v1.add_tokens',\n",
" <recon.operations.Operation at 0x7fc691de3e48>),\n",
" ('recon.v1.upcase_labels',\n",
" <recon.operations.Operation at 0x7fc691de3f60>),\n",
" ('recon.v1.filter_overlaps',\n",
" <recon.operations.Operation at 0x7fc691de3f98>),\n",
" ('recon.v1.prodigy.merge_examples',\n",
" <recon.operations.Operation at 0x7fc6900ccbe0>)])"
]
},
"execution_count": 4,
@ -157,7 +145,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@ -445,21 +433,9 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'Corpus' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-ca27ac4abad7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcorpus2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCorpus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_disk\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'./fixed_data/skills'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'Corpus' is not defined"
]
}
],
"outputs": [],
"source": [
"corpus2 = Corpus.from_disk('./fixed_data/skills')"
]
@ -504,6 +480,94 @@
"corpus2.apply(get_label_disparities, \"SKILL\", \"JOB_ROLE\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 1. 2. 3. 4. 5.]\n",
" [ 6. 7. 8. 9. 10.]\n",
" [11. 12. 13. 14. 15.]\n",
" [16. 17. 18. 19. 20.]\n",
" [21. 22. 23. 24. 25.]]\n"
]
},
{
"data": {
"text/plain": [
"array([[ True, False, True, False, True],\n",
" [False, True, False, True, False],\n",
" [ True, False, True, False, True],\n",
" [False, True, False, True, False],\n",
" [ True, False, True, False, True]])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
" \n",
"# Create a 5 x 5 ndarray with consecutive integers from 1 to 25 (inclusive).\n",
"# Afterwards use Boolean indexing to pick out only the odd numbers in the array\n",
" \n",
"# Create a 5 x 5 ndarray with consecutive integers from 1 to 25 (inclusive).\n",
"X = np.linspace(1,25,25).reshape(5,5)\n",
"print(X)\n",
" \n",
"# Use Boolean indexing to pick out only the odd numbers in the array\n",
"X * X % 2 == 1\n",
"# Y = X[]\n",
"# print(Y)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Correction(annotation='model', from_label='PRODUCT', to_label='SKILL')]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def corrections_from_dict(corrections_dict: Dict[str, Any]):\n",
" corrections: List[Correction] = []\n",
" for key, val in corrections_dict.items():\n",
" if isinstance(val, str):\n",
" from_label = \"ANY\"\n",
" to_label = val\n",
" elif isinstance(val, tuple):\n",
" from_label = val[0]\n",
" to_label = val[1]\n",
" else:\n",
" raise ValueError(\"Cannot parse corrections dict. Value must be either a str of the label \" +\n",
" \"to change the annotation to (TO_LABEL) or a tuple of (FROM_LABEL, TO_LABEL)\")\n",
" corrections.append(Correction(\n",
" annotation=key,\n",
" from_label=from_label,\n",
" to_label=to_label\n",
" ))\n",
" return corrections\n",
"\n",
"corrections_from_dict({\n",
" \"model\": (\"PRODUCT\", \"SKILL\"),\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 10,
@ -522,6 +586,9 @@
}
],
"source": [
"corrections = [\n",
" Correction(text=\"\")\n",
"]\n",
"corpus2.apply_(fix_annotations, {\"model\": \"SKILL\"})"
]
},

Просмотреть файл

@ -28,8 +28,8 @@ classifiers = [
"License :: OSI Approved :: MIT License"
]
requires = [
"pydantic == 1.4",
"spacy >= 2.2.1, <3.0.0",
"pydantic == 1.5",
"spacy >= 2.3.0, <3.0.0",
"scipy",
"typer >= 0.0.8",
"tqdm"

Просмотреть файл

@ -6,10 +6,11 @@ from typing import Any, DefaultDict, Dict, List
import spacy
from spacy.tokens import Doc as SpacyDoc, Span as SpacySpan
from wasabi import msg
from .operations import operation
from .preprocess import SpacyPreProcessor
from .types import Example, Span, Token, TransformationCallbacks
from .types import Correction, Example, Span, Token, TransformationCallbacks
@operation("recon.v1.rename_labels")
@ -30,7 +31,7 @@ def rename_labels(example: Example, label_map: Dict[str, str]) -> Example:
@operation("recon.v1.fix_annotations")
def fix_annotations(
example: Example, corrections: Dict[str, str], case_sensitive: bool = False
example: Example, corrections: List[Correction], case_sensitive: bool = False, dryrun: bool = False
) -> Example:
"""Fix annotations in a copy of List[Example] data.
@ -43,28 +44,35 @@ def fix_annotations(
corrections (List[Correction]): List of Correction objects mapping annotation text to a new label.
If the to_label of a Correction is set to None, the annotation will be removed
case_sensitive (bool, optional): Consider case of text for each correction
dryrun (bool, optional): Treat corrections as a dryrun and just print all changes to be made
Returns:
Example: Example with fixed annotations
"""
if case_sensitive:
corrections = {t: l for t, l in corrections.items()}
else:
corrections = {t.lower(): l for t, l in corrections.items()}
if not case_sensitive:
for c in corrections:
c.text = c.text.lower()
prints: DefaultDict[str, List[str]] = defaultdict(list)
corrections_map: Dict[str, Correction] = {c.annotation: c for c in corrections}
prints: List[str] = []
ents_to_remove = []
ents_to_remove: List[int] = []
for i, s in enumerate(example.spans):
t = s.text if case_sensitive else s.text.lower()
if t in corrections:
if corrections[t] is print:
prints[t] += [("=" * 100), example.text, s.label]
elif corrections[t] is None:
ents_to_remove.append(i)
else:
s.label = corrections[t]
if t in corrections_map:
c = corrections_map[t]
if c.to_label is None and s.label == c.from_label:
if dryrun:
prints.append(f"Deleting span: {s.text}")
else:
ents_to_remove.append(i)
elif s.label == c.from_label:
if dryrun:
prints.append(f"Correction span: {s.text} from label: {c.from_label} to label: {c.to_label}")
else:
s.label = c.to_label
i = len(ents_to_remove) - 1
while i >= 0:
@ -72,14 +80,55 @@ def fix_annotations(
del example.spans[idx]
i -= 1
for k in sorted(prints):
print(f"**{k}**")
for line in prints[k]:
print(line)
msg.divider("Example Text")
msg.text(example.text)
for line in prints:
msg.text(line)
return example
def corrections_from_dict(corrections_dict: Dict[str, Any]) -> List[Correction]:
    """Create a list of Correction objects from a simpler dict-based config.

    Each key of the dict is the annotation text to correct. Each value is either:
      * a str: the label to convert the annotation to (from any current label), or
      * a tuple of (from_label, to_label) / (List[from_labels], to_label): restrict
        the correction to annotations currently carrying the given label(s).

    e.g. {
        "united states": "GPE",
        "London": (["LOC"], "GPE")
    }

    Args:
        corrections_dict (Dict[str, Any]): Corrections formatted dict

    Raises:
        ValueError: If a value is neither a str (to_label) nor a tuple
            of (from_label(s), to_label)

    Returns:
        List[Correction]: Corrections parsed from the dict
    """
    corrections: List[Correction] = []
    for key, val in corrections_dict.items():
        if isinstance(val, str):
            # No source-label restriction: "ANY" matches any current label.
            # BUG FIX: this branch previously set the singular name `from_label`,
            # leaving `from_labels` (used below) unbound or stale from a prior
            # tuple-valued iteration.
            from_labels = ["ANY"]
            to_label = val
        elif isinstance(val, tuple):
            # Normalize a single from_label str into a one-element list.
            if isinstance(val[0], str):
                from_labels = [val[0]]
            else:
                from_labels = val[0]
            to_label = val[1]
        else:
            raise ValueError(
                "Cannot parse corrections dict. Value must be either a str of the label "
                + "to change the annotation to (TO_LABEL) or a tuple of (FROM_LABEL, TO_LABEL)"
            )
        corrections.append(Correction(
            annotation=key,
            from_labels=from_labels,
            to_label=to_label
        ))
    return corrections
@operation("recon.v1.strip_annotations")
def strip_annotations(
example: Example, strip_chars: List[str] = [".", "!", "?", "-", ":", " "]

Просмотреть файл

@ -193,6 +193,8 @@ class Dataset:
data = loader_func(path)
self.data = data
for example in self.data:
self.example_store.add(example)
if ds_op_state and self.commit_hash != ds_op_state.commit:
# Dataset changed, examples added

Просмотреть файл

@ -23,7 +23,7 @@ from .types import (
)
def get_ents_by_label(data: List[Example], use_lower: bool = True) -> DefaultDict[str, List[str]]:
def get_ents_by_label(data: List[Example], case_sensitive: bool = False) -> DefaultDict[str, List[str]]:
"""Get a dictionary of unique text spans by label for your data
# TODO: Ok so this needs to return more than just a set for each label.
@ -39,7 +39,7 @@ def get_ents_by_label(data: List[Example], use_lower: bool = True) -> DefaultDic
Args:
data (List[Example]): List of examples
use_lower (bool, optional): Use the lowercase form of the span text.
case_sensitive (bool, optional): Consider case of text for each annotation
sort_by (SortBy): Sort by text or by count
Returns:
@ -51,7 +51,7 @@ def get_ents_by_label(data: List[Example], use_lower: bool = True) -> DefaultDic
for e in data:
for s in e.spans:
span_text = s.text.lower() if use_lower else s.text
span_text = s.text if case_sensitive else s.text.lower()
annotations[s.label].add(span_text)
for label, anns in annotations.items():
@ -61,7 +61,7 @@ def get_ents_by_label(data: List[Example], use_lower: bool = True) -> DefaultDic
def get_label_disparities(
data: List[Example], label1: str, label2: str, use_lower: bool = True
data: List[Example], label1: str, label2: str, case_sensitive: bool = False
) -> Set[str]:
"""Identify annotated spans that have different labels in different examples
@ -69,24 +69,24 @@ def get_label_disparities(
data (List[Example]): Input List of examples
label1 (str): First label to compare
label2 (str): Second label to compare
use_lower (bool, optional): Use the lowercase form of the span text in ents_to_label.
case_sensitive (bool, optional): Consider case of text for each annotation
Returns:
Set[str]: Set of all unique text spans that overlap between label1 and label2
"""
annotations = get_ents_by_label(data, use_lower=use_lower)
annotations = get_ents_by_label(data, case_sensitive=case_sensitive)
return set(annotations[label1]).intersection(set(annotations[label2]))
def top_label_disparities(
data: List[Example], use_lower: bool = True, dedupe: bool = False
data: List[Example], case_sensitive: bool = False, dedupe: bool = False
) -> List[LabelDisparity]:
"""Identify annotated spans that have different labels
in different examples for all label pairs in data.
Args:
data (List[Example]): Input List of examples
use_lower (bool, optional): Use the lowercase form of the span text in ents_to_label.
case_sensitive (bool, optional): Consider case of text for each annotation
dedupe (bool, optional): Whether to deduplicate for table view vs confusion matrix.
False by default for easy confusion matrix display.
@ -94,7 +94,7 @@ def top_label_disparities(
List[LabelDisparity]: List of LabelDisparity objects for each label pair combination
sorted by the number of disparities between them.
"""
annotations = get_ents_by_label(data, use_lower=use_lower)
annotations = get_ents_by_label(data, case_sensitive=case_sensitive)
label_disparities = {}
for label1 in annotations.keys():
for label2 in annotations.keys():

Просмотреть файл

@ -112,7 +112,7 @@ def calculate_label_distribution_similarity(x: List[Example], y: List[Example])
def get_entity_coverage(
data: List[Example], sep: str = "||", use_lower: bool = True, return_examples: bool = False,
data: List[Example], sep: str = "||", case_sensitive: bool = False, return_examples: bool = False,
) -> List[EntityCoverage]:
"""Identify how well you dataset covers an entity type. Get insights
on the how many times certain text/label span combinations exist across your
@ -123,7 +123,7 @@ def get_entity_coverage(
data (List[Example]): List of examples
sep (str, optional): Separator used in coverage map, only change if || exists in your text
or label.
use_lower (bool, optional): Use the lowercase form of the span text in ents_to_label.
case_sensitive (bool, optional): Consider case of text for each annotation
return_examples (bool, optional): Return Examples that contain the entity label annotation.
Returns:
@ -135,9 +135,7 @@ def get_entity_coverage(
for example in data:
for span in example.spans:
text = span.text
if use_lower:
text = text.lower()
text = span.text if case_sensitive else span.text.lower()
key = f"{text}{sep}{span.label}"
coverage_map[key] += 1
examples_map[key].append(example)

Просмотреть файл

@ -250,3 +250,11 @@ class Outliers(BaseModel):
low: List[int]
high: List[int]
class Correction(BaseModel):
    """Container for an annotation correction, mapping an annotation
    from one or more source labels to a new label."""

    # The annotation (span text) this correction applies to.
    annotation: str
    # Current label(s) an annotation must carry for the correction to apply
    # (presumably ["ANY"] matches any label — confirm against fix_annotations).
    from_labels: List[str]
    # The label to assign to matching annotations.
    to_label: str