From fb73c8e7f9f58faccca7de7ee6e86acac2ea1c51 Mon Sep 17 00:00:00 2001 From: Kabir Khan Date: Wed, 22 Apr 2020 11:03:11 -0700 Subject: [PATCH] Adding docs and fixing tokenization add_tokens to use token_end as t.i + 1 for consistency with spaCy. --- docs/api/operations.md | 9 + docs/operations_registry.py | 0 docs/src/tutorial/1_stats.py | 10 +- docs/src/tutorial/3_dataset_mutate.py | 23 ++ docs/src/tutorial/3_dataset_mutate_save.py | 25 ++ docs/src/tutorial/3_dataset_stats.py | 15 + docs/tutorial/1_loading_data.md | 93 ++---- docs/tutorial/2_ner_stats.md | 49 ++- docs/tutorial/3_dataset_intro.md | 126 +++++++ docs/tutorial/3_dataset_mutate.md | 311 ++++++++++++++++++ .../{3_corpus_apply.md => 4_corpus.md} | 0 docs/tutorial/4_more_stats.md | 9 +- examples/2.0_operations.ipynb | 31 +- mkdocs.yml | 3 + recon/operations.py | 5 +- recon/prodigy/ner_mirror.py | 133 -------- recon/tokenization.py | 20 +- 17 files changed, 624 insertions(+), 238 deletions(-) create mode 100644 docs/operations_registry.py create mode 100644 docs/src/tutorial/3_dataset_mutate.py create mode 100644 docs/src/tutorial/3_dataset_mutate_save.py create mode 100644 docs/src/tutorial/3_dataset_stats.py create mode 100644 docs/tutorial/3_dataset_intro.md create mode 100644 docs/tutorial/3_dataset_mutate.md rename docs/tutorial/{3_corpus_apply.md => 4_corpus.md} (100%) delete mode 100644 recon/prodigy/ner_mirror.py diff --git a/docs/api/operations.md b/docs/api/operations.md index bb8af4d..0891663 100644 --- a/docs/api/operations.md +++ b/docs/api/operations.md @@ -1 +1,10 @@ +Operations are functions that operate on either a list of the examples or a single example. +If the function operates on a single example, Recon will take care of applying it to all examples in a dataset. + +The following operations are built into Recon + + +!!!error + ... 
full list of operations to come + ::: recon.operations diff --git a/docs/operations_registry.py b/docs/operations_registry.py new file mode 100644 index 0000000..e69de29 diff --git a/docs/src/tutorial/1_stats.py b/docs/src/tutorial/1_stats.py index 7287198..389e667 100644 --- a/docs/src/tutorial/1_stats.py +++ b/docs/src/tutorial/1_stats.py @@ -1,14 +1,14 @@ from pathlib import Path import typer -from recon.dataset import Dataset +from recon.loaders import read_jsonl from recon.stats import get_ner_stats -def main(data_dir: Path): - ds = Dataset.from_disk(data_dir) - train_stats = get_ner_stats(ds.train) - get_ner_stats(ds.train, serialize=True) +def main(data_file: Path): + data = read_jsonl(data_file) + train_stats = get_ner_stats(data) + print(get_ner_stats(data, serialize=True)) if __name__ == "__main__": diff --git a/docs/src/tutorial/3_dataset_mutate.py b/docs/src/tutorial/3_dataset_mutate.py new file mode 100644 index 0000000..96b99e2 --- /dev/null +++ b/docs/src/tutorial/3_dataset_mutate.py @@ -0,0 +1,23 @@ +from pathlib import Path + +import typer +from recon.dataset import Dataset +from recon.stats import get_ner_stats + + +def main(data_file: Path, output_file: Path): + ds = Dataset("train").from_disk(data_file) + + print("STATS BEFORE") + print("============") + print(ds.apply(get_ner_stats, serialize=True)) + + ds.apply_("recon.v1.upcase_labels") + + print("STATS AFTER") + print("===========") + print(ds.apply(get_ner_stats, serialize=True)) + + +if __name__ == "__main__": + typer.run(main) \ No newline at end of file diff --git a/docs/src/tutorial/3_dataset_mutate_save.py b/docs/src/tutorial/3_dataset_mutate_save.py new file mode 100644 index 0000000..e642490 --- /dev/null +++ b/docs/src/tutorial/3_dataset_mutate_save.py @@ -0,0 +1,25 @@ +from pathlib import Path + +import typer +from recon.dataset import Dataset +from recon.stats import get_ner_stats + + +def main(data_file: Path, output_file: Path): + ds = Dataset("train").from_disk(data_file) + + 
print("STATS BEFORE") + print("============") + print(ds.apply(get_ner_stats, serialize=True)) + + ds.apply_("recon.v1.upcase_labels") + + print("STATS AFTER") + print("===========") + print(ds.apply(get_ner_stats, serialize=True)) + + ds.to_disk(output_file, force=True) + + +if __name__ == "__main__": + typer.run(main) \ No newline at end of file diff --git a/docs/src/tutorial/3_dataset_stats.py b/docs/src/tutorial/3_dataset_stats.py new file mode 100644 index 0000000..efbd964 --- /dev/null +++ b/docs/src/tutorial/3_dataset_stats.py @@ -0,0 +1,15 @@ +from pathlib import Path + +import typer +from recon.dataset import Dataset +from recon.stats import get_ner_stats + + +def main(data_file: Path): + data = read_jsonl(data_file) + train_stats = get_ner_stats(data) + print(get_ner_stats(data, serialize=True)) + + +if __name__ == "__main__": + typer.run(main) diff --git a/docs/tutorial/1_loading_data.md b/docs/tutorial/1_loading_data.md index 54bf19f..e1ef1d9 100644 --- a/docs/tutorial/1_loading_data.md +++ b/docs/tutorial/1_loading_data.md @@ -1,6 +1,6 @@ # Loading your data -ReconNER expects your data to be in the most basic [Prodigy Annotation Format](https://prodi.gy/docs/api-interfaces#ner). +Recon NER expects your data to be in the most basic [Prodigy Annotation Format](https://prodi.gy/docs/api-interfaces#ner). A single example in this format looks like: @@ -11,75 +11,42 @@ A single example in this format looks like: } ``` -ReconNER does require that you have the tokens property set and will try to resolve any tokenization errors in your -data for you. If your have already been tokenized (which is true if you used the ner_manual Prodigy recipe), ReconNER -will skip the tokenization step. +Recon does require that you have the tokens property set and will try to resolve any tokenization errors in your +data for you as well as add tokens if they don't already exist. 
If your data has already been tokenized (which is true if you used the ner_manual Prodigy recipe), Recon will skip the tokenization step. -ReconNER expects your data to be in a collection in the `.jsonl` File Format. +Recon expects your data to be in a collection in a JSONL or JSON file. -## Load Corpus from_disk - -There are several utilities available for loading your data. +!!!note + More loaders for different file types (`CONLL`) will be added in future versions -The easiest way to load your data is to initialize a [Corpus](../api/corpus.md) from disk. -If you have a train/dev/test split or just train/dev files in the same directory, it's as easy as calling the `from_disk` `classmethod` for the `Corpus` object. +## Loaders -```Python -corpus = Corpus.from_disk('path/to/data_dir') +Recon comes with a few loaders, `read_jsonl` and `read_json`. They're simple enough: they just load the data from disk and create instances of the strongly typed `Example` class for each raw example. + +The `Example` class provides some basic validation that ensures all spans have a text property (which they don't if you're using newer versions of Prodigy and the ner.manual recipe for annotation). + +Everything in Recon is built to run on a single `Example` or a `List[Example]`. + +However, the goal of Recon is to provide insights across all of your annotated examples, not just one. For this, we need a wrapper around a set of examples. This is called a [`Dataset`](/api/dataset). + +Let's use the `read_jsonl` loader to load some annotated data created with Prodigy + +!!!tip + If you don't have any data available, you can use the data in the examples folder [here](https://github.com/microsoft/reconner/tree/master/examples/data/skills). We'll be using this data for the rest of the tutorial. 
+ +```python +from recon.loaders import read_jsonl +from recon.types import Example + + +data = read_jsonl('examples/data/skills/train.jsonl') + +assert isinstance(data, Example) ``` -`Corpus.from_disk` will look in the `data_dir` you provide for a file structure that looks like: - -``` -data_dir -│ train.jsonl -│ dev.jsonl -│ test.jsonl -``` - -!!! tip - The test.jsonl file is **optional** but generally you should split your annotated data into train/dev/test files. - -## The Process of Loading Data - -While it's recommended to load data using the `Corpus.from_disk` method, you can also load data directly from disk using the `loaders.read_jsonl` and `loaders.read_json` functions. - -These functions expect the same example format (in fact, the `Corpus.from_disk` runs `loaders.read_jsonl` function) and run a few steps. - -The default pipeline that ReconNER runs when you load data is the following: - -```mermaid -graph TD - AA[File System Prodigy Format Dataset `train.jsonl`] -->|srsly.read_jsonl| A[Raw Data in Prodigy Dict format] - A -->|fix_tokenization_and_spacing| B[Raw Data with all annotations aligned to spaCy token boundaries] - B -->|add_tokens| C[Raw Data with added `tokens` property] - C -->|fix_annotations_format| D[Raw Data with fixed annotations format.] - D -->|json_to_examples| E[Strongly typed List of recon.types.Example instances ready for downstream analysis] - -``` - -### 1. Read data from disk -Loads your data with srsly using `srsly.read_jsonl` or `srsly.read_json` - -### 2. Fix Tokenization and Spacing -Fixes all annotations that are not aligned to spaCy token boundaries if possible. If examples cannot easily be fixed, the default behavior is to remove these -examples. This should rarely be the case and if this function cannot correct the Example it was likely a bad example that would confuse your model anyway. - -### 3. 
Add Tokens -The previous step computes token boundaries but then can alter the text of some examples to fix spacing and tokenization issues. -In this step we rerun the spaCy tokenizer and add a tokens property to the data inline with the Prodigy format. - -### 4. Fix Annotation Format -Fixes some common issues in Annotation formatting that can arise using the [`validation.fix_annotations_format`](../../api/validation/#fix_annotations_format) - -### 5. Filter Overlapping Entities -Often, you'll find your data has overlapping entities. For instance, imagine you have 2 annotators and one decided "Tesla" is a `PRODUCT` and the other noticed that the sentence is actually about "Tesla Motors" which they label as an `ORG`. This function does it's best to resolve these overlaps and in the case above would select "Tesla Motors" `ORG` as the correct entity, deleting "Tesla" `PRODUCT` from the data [`validation.filter_overlaps`](../../api/validation/#filter_overlaps) - -### 6. Load into ReconNER type system - -Finally these loaders will take a list of JSON examples in the Prodigy Annotation Format outlined above and convert it into a list of `Example` models using Pydantic +Now we have some examples to work, we can start examining our data. ## Next Steps -Once you have your data loaded, you can run other ReconNER functions on top of it to gain insights into the quality and completeness of your NER data +Once you have your data loaded, you can run other Recon functions on top of it to gain insights into the quality and completeness of your NER data as well as to start making corrections to the inconsistently annotated examples you almost certainly have (Don't worry, that's fine! 
Messy data is everywhere, even Microsoft) diff --git a/docs/tutorial/2_ner_stats.md b/docs/tutorial/2_ner_stats.md index c896ae0..39cb97c 100644 --- a/docs/tutorial/2_ner_stats.md +++ b/docs/tutorial/2_ner_stats.md @@ -6,11 +6,11 @@ Once you have your data loaded either by itself as a list of `Example`s or as a The `stats.get_ner_stats` function expects a `List[Example]` as it's input parameter and will return a serializable response with info about your data. Let's see how this works on the provided example data. -!!! tip + ## Example @@ -21,34 +21,51 @@ Create a file main.py with: {!./src/tutorial/1_stats.py!} ``` -Run the application with the example data. +Run the application with the example data and you should see the following results.
```console -$ python main.py ./examples/data/skills +$ python main.py ./examples/data/skills/train.jsonl { - "n_examples":102, + "n_examples":106, "n_examples_no_entities":29, - "ents_per_type":{ - "SKILL":191, - "PRODUCT":34, - "JOB_ROLE":5 - } + "n_annotations":243, + "n_annotations_per_type":{ + "SKILL":197, + "PRODUCT":33, + "JOB_ROLE":10, + "skill":2, + "product":1 + }, + "examples_with_type":null } ```
-But it isn't super helpful to have stats on **just** your training data. -And it'd be really annoying to have to call the same function on each dataset: +Great! We have some basic stats about our data but we can already see some issues. Looks like some of our examples are annotated with lowercase labels. These are obviously mistakes and we'll see how to fix these shortly. + +But first, it isn't super helpful to have stats on **just** your `train` data. +And it'd be really annoying to have to call the same function on each list of examples: ```Python -get_ner_stats(ds.train, serialize=True) -get_ner_stats(ds.dev, serialize=True) -get_ner_stats(ds.test, serialize=True) +train = read_jsonl(train_file) +print(get_ner_stats(train, serialize=True)) + +dev = read_jsonl(dev_file) +print(get_ner_stats(dev, serialize=True)) + +test = read_jsonl(test_file) +print(get_ner_stats(test, serialize=True)) ``` ## Next Steps -In the next step step of this tutorial you'll learn about how to remove the above boilerplate and run functions across your train/dev/test Dataset split. +In the next step of this tutorial we'll introduce the core containers Recon uses for managing examples and state: + +1. [`Dataset`](/api/dataset) - A `Dataset` has a name and holds a list of examples. It's also responsible for tracking any mutations done to its internal data through Recon operations. (More on this [later](link_to_operations)) + +and + +2. [`Corpus`](/api/corpus). A `Corpus` is a wrapper around a set of datasets that represent a typical train/eval or train/dev/test split. Using a `Corpus` allows you to gain insights on how well your train set represents your dev/test sets. \ No newline at end of file diff --git a/docs/tutorial/3_dataset_intro.md b/docs/tutorial/3_dataset_intro.md new file mode 100644 index 0000000..f62c050 --- /dev/null +++ b/docs/tutorial/3_dataset_intro.md @@ -0,0 +1,126 @@ +In Recon, a [`Dataset`](/api/dataset) has a few responsibilities. 
+ +* Store examples +* Store state of **every** mutation made to it using recon operations +* Provide an easy interface to apply functions and pipelines to the dataset data +* Easily serialize and deserialize from/to disk to track state of data across the duration of an annotation project + + +## Getting Started with Datasets + +The easiest way to get started with a `Dataset` is using the from_disk method. + +The following example starts by initializing a Dataset with a name ("train") and loading the train.jsonl data for the skills example dataset + +Replace the code in main.py with the following + +```Python +from pathlib import Path + +import typer +from recon.dataset import Dataset +from recon.stats import get_ner_stats + + +def main(data_file: Path): + ds = Dataset("train").from_disk(data_file) + print(get_ner_stats(ds.data, serialize=True)) + + +if __name__ == "__main__": + typer.run(main) +``` + +and run with the same command. You should see the exact same result as you did without +using a Dataset. That's because `Dataset.from_disk` calls `read_jsonl` + +```console +$ python main.py ./examples/data/skills/train.jsonl +{ + "n_examples":106, + "n_examples_no_entities":29, + "n_annotations":243, + "n_annotations_per_type":{ + "SKILL":197, + "PRODUCT":33, + "JOB_ROLE":10, + "skill":2, + "product":1 + }, + "examples_with_type":null +} +``` + +## Applying functions to Datasets + +In the previous example we called the get_ner_stats function on the data from the train `Dataset`. +`Dataset` provides a utility function called `apply`. `Dataset.apply` takes any function that operates on a List of Examples and runs it on the Dataset's internal data. 
+ +```Python +from pathlib import Path + +import typer +from recon.dataset import Dataset +from recon.stats import get_ner_stats + + +def main(data_file: Path): + ds = Dataset("train").from_disk(data_file) + print(ds.apply(get_ner_stats, serialize=True)) + + +if __name__ == "__main__": + typer.run(main) +``` + +This might not be that interesting (it doesn't save you a ton of code) but `Dataset.apply` can accept either a function or a name for a registered Recon operation. All functions are registered in a Recon registry. + +All functions packaged with recon have "recon.vN..." as a prefix. + +So the above example can be converted to: + +```Python +from pathlib import Path + +import typer +from recon.dataset import Dataset + + +def main(data_file: Path): + ds = Dataset("train").from_disk(data_file) + print(ds.apply("recon.v1.get_ner_stats", serialize=True)) + + +if __name__ == "__main__": + typer.run(main) +``` + +This means you don't have to import the get_ner_stats function. For a full list of operations see +the [operations API guide](/api/operations) + +All of these examples should return the exact same response. See for yourself: + +
+ +```console +$ python main.py ./examples/data/skills/train.jsonl +{ + "n_examples":106, + "n_examples_no_entities":29, + "n_annotations":243, + "n_annotations_per_type":{ + "SKILL":197, + "PRODUCT":33, + "JOB_ROLE":10, + "skill":2, + "product":1 + }, + "examples_with_type":null +} +``` +
+ +## Next Steps + +It's great that we can manage our data operations using a Dataset and named functions but our data is still messy. We still have those pesky lowercased labels for "skill" and "product" that should clearly be "SKILL" and "PRODUCT" respectively. +In the next step of the tutorial we'll learn how to run operations that mutate a `Dataset` and everything Recon does to keep track of these operations for you. diff --git a/docs/tutorial/3_dataset_mutate.md b/docs/tutorial/3_dataset_mutate.md new file mode 100644 index 0000000..e797d1b --- /dev/null +++ b/docs/tutorial/3_dataset_mutate.md @@ -0,0 +1,311 @@ +Now that we have our data managed in a Recon `Dataset`, we can make corrections to our data automatically and Recon will take care of keeping track of all operations and transformations run on our data. + +The key is the `Dataset.apply_` function. + +!!!tip + It's a common Python convention that as far as I know was popularized by PyTorch to have a function return a value (i.e. `apply`) and that same function name followed by an underscore (i.e. `apply_`) operate on that data in place. + +## Correcting a Dataset + +`Dataset.apply_` requires a registered in-place operation that will run across all examples in the Dataset's data. + +Let's see an example. + +```Python hl_lines="15" +{!./src/tutorial/3_dataset_mutate.py!} +``` + +
+ +```console +$ python main.py examples/data/skills/train.jsonl + +STATS BEFORE +============ +{ + "n_examples":106, + "n_examples_no_entities":29, + "n_annotations":243, + "n_annotations_per_type":{ + "SKILL":197, + "PRODUCT":33, + "JOB_ROLE":10, + "skill":2, + "product":1 + }, + "examples_with_type":null +} +STATS AFTER +=========== +{ + "n_examples":106, + "n_examples_no_entities":29, + "n_annotations":243, + "n_annotations_per_type":{ + "SKILL":199, + "PRODUCT":34, + "JOB_ROLE":10 + }, + "examples_with_type":null +} +``` +
+ +Nice! We've easily applied the built-in "upcase_labels" function from Recon to fix our obvious mistakes. + +But that's not all... + +## Tracking operations + +It would be really easy to lose track of the operations run on our data if we ran a bunch of operations. Even with our single operation, we'd have to save a copy of the data before running the `upcase_labels` operation and drill into both versions of the dataset to identify which examples we actually changed. Recon takes care of this tracking for us. + +Let's extend our previous example by saving our new Dataset to disk using (conveniently) `Dataset.to_disk`. + + +```Python hl_lines="21" +{!./src/tutorial/3_dataset_mutate_save.py!} +``` + +
+ +```console +$ python main.py examples/data/skills/train.jsonl examples/fixed_data/skills/train.jsonl + +STATS BEFORE +============ +{ + "n_examples":106, + "n_examples_no_entities":29, + "n_annotations":243, + "n_annotations_per_type":{ + "SKILL":197, + "PRODUCT":33, + "JOB_ROLE":10, + "skill":2, + "product":1 + }, + "examples_with_type":null +} +STATS AFTER +=========== +{ + "n_examples":106, + "n_examples_no_entities":29, + "n_annotations":243, + "n_annotations_per_type":{ + "SKILL":199, + "PRODUCT":34, + "JOB_ROLE":10 + }, + "examples_with_type":null +} + +``` +
+ +This should have the same console output as before. + +Let's investigate what Recon saved. + + +```Python hl_lines="15" +{!./src/tutorial/3_dataset_mutate.py!} +``` + +
+ +```console +$ ll -a examples/fixed_data/skills + +├── .recon +├── train.jsonl + +// train.jsonl is just our serialized data from our train Dataset +// What's in .recon? + +$ tree -a examples/fixed_data/skills/ + +examples/fixed_data/skills/ +├── .recon +│   ├── example_store.jsonl +│   └── train +│   └── state.json +└── train.jsonl + +// Let's investigate the state.json for our train dataset. + +$ cat examples/fixed_data/skills/.recon/train/state.json + +{ + "name":"recon.v1.upcase_labels", + "batch":false, + "args":[], + "kwargs":{}, + "status":"COMPLETED", + "ts":1586687281, + "examples_added":0, + "examples_removed":0, + "examples_changed":3, + "transformations":[ + { + "prev_example":1923088532738022750, + "example":1401028415299739275, + "type":"EXAMPLE_CHANGED" + }, + { + "prev_example":459906967662468309, + "example":1525998324968157929, + "type":"EXAMPLE_CHANGED" + }, + { + "prev_example":200276835658424828, + "example":407710308633891847, + "type":"EXAMPLE_CHANGED" + } + ] +} +``` +
+ +In the last command above you can see the output of what Recon saves when you call `Dataset.to_disk`. + +Let's dig into the saved state a bit more. + +## Dataset state + +The first property stored is the dataset name. Pretty self-explanatory. +The second, `commit`, is a bit more complex. + +!!!tip + A core principle of Recon is that all the data types can be hashed deterministically. This means you'll get the same hash across Python environments and sessions for each core data type including: Corpus, Dataset, Example, Span and Token. + +The `commit` property is a SHA-1 hash of the dataset +name combined with that hash of each example in the dataset. +If you're familiar with how [git](https://git-scm.com/) works the idea is pretty similar. + +The `commit` property of a dataset allows us to understand if a Dataset changes between operations. +This can happen if you add new examples and want to rerun or run new operations later based on insights from that new data. + +```json hl_lines="4 6 7 8 9 10" +{ + "name": "train", + "commit_hash": "1923088532738022750", + "operations": [ + { + "name":"recon.v1.upcase_labels", + "args":[], + "kwargs":{}, + "status":"COMPLETED", + "ts":1586687281, + "examples_added":0, + "examples_removed":0, + "examples_changed":3, + "transformations":[ + { + "prev_example":1923088532738022750, + "example":1401028415299739275, + "type":"EXAMPLE_CHANGED" + }, + { + "prev_example":459906967662468309, + "example":1525998324968157929, + "type":"EXAMPLE_CHANGED" + }, + { + "prev_example":200276835658424828, + "example":407710308633891847, + "type":"EXAMPLE_CHANGED" + } + ] + } + ] +} +``` + +The core of the stored state is the `operations` property. This operations property has all the information needed to both track and re-run an operation on a dataset. + +In the above state we have 1 operation since that's all we've run on our dataset so far. 
+ +Each operation has a `name` (in this case `"recon.v1.upcase_labels"`) as well as any python `args` or `kwargs` run with the function. The `upcase_labels` operation has no required parameters so these are empty (we'll see some examples where these are not empty later in the tutorial). + +We also have a `status` (one of: NOT_STARTED, IN_PROGRESS, COMPLETED) and a `ts` (timestamp of when the operation was run). + +These attributes provide the base information to re-create the exact function call and provide the base information of the operation. + +The rest of the properties deal with transformation tracking. +The `examples_added`, `examples_removed`, `examples_changed` give you a summary of the overall changes by the operation. + +```json hl_lines="11 12 13" +{ + "name": "train", + "commit_hash": "1923088532738022750", + "operations": [ + { + "name":"recon.v1.upcase_labels", + "args":[], + "kwargs":{}, + "status":"COMPLETED", + "ts":1586687281, + "examples_added":0, + "examples_removed":0, + "examples_changed":3, + "transformations":[ + { + "prev_example":1923088532738022750, + "example":1401028415299739275, + "type":"EXAMPLE_CHANGED" + }, + { + "prev_example":459906967662468309, + "example":1525998324968157929, + "type":"EXAMPLE_CHANGED" + }, + { + "prev_example":200276835658424828, + "example":407710308633891847, + "type":"EXAMPLE_CHANGED" + } + ] + } + ] +} +``` + + +Finally, the `transformations` property is the most useful for actually auditing and tracking your data changes. 
+ +```json hl_lines="14 16 17 18" +{ + "name": "train", + "commit_hash": "1923088532738022750", + "operations": [ + { + "name":"recon.v1.upcase_labels", + "args":[], + "kwargs":{}, + "status":"COMPLETED", + "ts":1586687281, + "examples_added":0, + "examples_removed":0, + "examples_changed":3, + "transformations":[ + { + "prev_example":1923088532738022750, + "example":1401028415299739275, + "type":"EXAMPLE_CHANGED" + }, + { + "prev_example":459906967662468309, + "example":1525998324968157929, + "type":"EXAMPLE_CHANGED" + }, + { + "prev_example":200276835658424828, + "example":407710308633891847, + "type":"EXAMPLE_CHANGED" + } + ] + } + ] +} +``` diff --git a/docs/tutorial/3_corpus_apply.md b/docs/tutorial/4_corpus.md similarity index 100% rename from docs/tutorial/3_corpus_apply.md rename to docs/tutorial/4_corpus.md diff --git a/docs/tutorial/4_more_stats.md b/docs/tutorial/4_more_stats.md index 4b08149..2383815 100644 --- a/docs/tutorial/4_more_stats.md +++ b/docs/tutorial/4_more_stats.md @@ -3,11 +3,6 @@ Now that we have a Corpus loaded, let's investigate the rest of the `recon.stats` module. The best way to use the `recon.stats` module is through Prodigy directly. -Recon comes packaged with sidecar Prodigy Recipes for core NER annotation workflows. - -Full Prodigy recipe map - - -* `ner.manual` => `recon.ner_manual` -* `ner.manual` => `recon.ner_correct` +!!!tip + I'm working on a dashboard for Recon to display all the statistics and distributions of NER data. These docs will be updated in depth with how to use all the stats through the dashboard. 
For a full reference see the [recon.stats API docs](/api/stats) \ No newline at end of file diff --git a/examples/2.0_operations.ipynb b/examples/2.0_operations.ipynb index dca02d1..c3568a8 100644 --- a/examples/2.0_operations.ipynb +++ b/examples/2.0_operations.ipynb @@ -157,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -184,6 +184,35 @@ "corpus._train.operations" ] }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"n_examples\":106,\n", + " \"n_examples_no_entities\":29,\n", + " \"n_annotations\":243,\n", + " \"n_annotations_per_type\":{\n", + " \"SKILL\":197,\n", + " \"PRODUCT\":33,\n", + " \"JOB_ROLE\":10,\n", + " \"skill\":2,\n", + " \"product\":1\n", + " },\n", + " \"examples_with_type\":null\n", + "}\n" + ] + } + ], + "source": [ + "print(get_ner_stats(corpus._train.data, serialize=True))" + ] + }, { "cell_type": "code", "execution_count": 9, diff --git a/mkdocs.yml b/mkdocs.yml index 03e3a7a..3da65e9 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -20,6 +20,9 @@ nav: - Tutorial: - Loading your data: 'tutorial/1_loading_data.md' - NER Stats: 'tutorial/2_ner_stats.md' + - Dataset: + - Introduction: 'tutorial/3_dataset_intro.md' + - Making changes to a Dataset: 'tutorial/3_dataset_mutate.md' - Using a Corpus: 'tutorial/3_corpus_apply.md' - More NER Stats: 'tutorial/4_more_stats.md' - Getting Insights: 'tutorial/5_getting_insights.md' diff --git a/recon/operations.py b/recon/operations.py index a2f5c74..5d3e2b3 100644 --- a/recon/operations.py +++ b/recon/operations.py @@ -31,6 +31,7 @@ def op_iter( Args: data (List[Example]): List of examples to iterate + pre (List[PreProcessor]): List of preprocessors to run Yields: Iterator[Tuple[int, Example]]: Tuples of (example hash, example) @@ -52,7 +53,7 @@ class operation: Args: name (str): Operation name. 
- batch (bool): Send all examples in dataset for batch operation. + pre (List[PreProcessor]): List of preprocessors to run """ self.name = name self.pre = pre @@ -89,7 +90,7 @@ class Operation: Args: name (str): Name of operation - batch (bool): Whether the operation handles a batch of data or not + pre (List[PreProcessor]): List of preprocessors to run op (Callable): Decorated function """ self.name = name diff --git a/recon/prodigy/ner_mirror.py b/recon/prodigy/ner_mirror.py deleted file mode 100644 index 6138260..0000000 --- a/recon/prodigy/ner_mirror.py +++ /dev/null @@ -1,133 +0,0 @@ -# isort:skip_file -# type: ignore - -from collections import Counter, defaultdict -import copy -import random -from typing import Any, Dict, Iterable, List, Optional, Union - -import catalogue -import prodigy -from prodigy.components.db import connect -from prodigy.components.loaders import get_stream -from prodigy.components.preprocess import add_tokens -from prodigy.models.matcher import PatternMatcher -from prodigy.recipes.ner import get_labels_from_ner -from prodigy.util import ( - INPUT_HASH_ATTR, - TASK_HASH_ATTR, - get_labels, - log, - set_hashes, - split_string, -) -import spacy -import srsly -from wasabi import msg -from recon.constants import NONE -from recon.loaders import read_jsonl -from recon.types import HardestExample, Example, Span -from recon.validation import remove_overlapping_entities - -import dash -import dash_core_components as dcc -import dash_html_components as html - -external_stylesheets = ["https://codepen.io/chriddyp/pen/bWLwgP.css"] - -app = dash.Dash(__name__, external_stylesheets=external_stylesheets) - -app.layout = html.Div( - children=[ - html.H1(children="Hello Dash"), - html.Div( - children=""" - Dash: A web application framework for Python. 
- """ - ), - dcc.Graph( - id="example-graph", - figure={ - "data": [ - {"x": [1, 2, 3], "y": [4, 1, 2], "type": "bar", "name": "SF"}, - {"x": [1, 2, 3], "y": [2, 4, 5], "type": "bar", "name": "Montréal"}, - ], - "layout": {"title": "Dash Data Visualization"}, - }, - ), - ] -) - - -@prodigy.recipe( - "recon.ner_manual", - # fmt: off - dataset=("Dataset to save annotations to", "positional", None, str), - spacy_model=("Loadable spaCy model for tokenization or blank:lang (e.g. blank:en)", "positional", None, str), - source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str), - api=("DEPRECATED: API loader to use", "option", "a", str), - loader=("Loader (guessed from file extension if not set)", "option", "lo", str), - label=("Comma-separated label(s) to annotate or text file with one label per line", "option", "l", get_labels), - patterns=("Path to match patterns file", "option", "pt", str), - exclude=("Comma-separated list of dataset IDs whose annotations to exclude", "option", "e", split_string), - # fmt: on -) -def manual( - dataset: str, - spacy_model: str, - source: Union[str, Iterable[dict]] = "-", - api: Optional[str] = None, - loader: Optional[str] = None, - label: Optional[List[str]] = None, - patterns: Optional[str] = None, - exclude: Optional[List[str]] = None, -) -> Dict[str, Any]: - """ - Mark spans by token. Requires only a tokenizer and no entity recognizer, - and doesn't do any active learning. 
- """ - log("RECIPE: Starting recipe ner.manual", locals()) - if spacy_model.startswith("blank:"): - nlp = spacy.blank(spacy_model.replace("blank:", "")) - else: - nlp = spacy.load(spacy_model) - labels = label # comma-separated list or path to text file - if not labels: - labels = get_labels_from_ner(nlp) - if not labels: - msg.fail("No --label argument set and no labels found in model", exits=1) - msg.text(f"Using {len(labels)} labels from model: {', '.join(labels)}") - log(f"RECIPE: Annotating with {len(labels)} labels", labels) - stream = get_stream(source, api, loader, rehash=True, dedup=True, input_key="text") - if patterns is not None: - pattern_matcher = PatternMatcher(nlp, combine_matches=True, all_examples=True) - pattern_matcher = pattern_matcher.from_disk(patterns) - stream = (eg for _, eg in pattern_matcher(stream)) - stream = add_tokens(nlp, stream) # add "tokens" key to the tasks - - print(app.index()) - - print(app.server.url_map) - import requests - - # print(app.server.view_functions['/']) - print(app.serve_layout()) - # app.run_server() - # html = requests.get('127.0.0.1:8050/').text() - - # with open('./recon/prodigy/templates/graph.html') as f: - # html = f.read() - html = "" - - return { - "view_id": "blocks", - "dataset": dataset, - "stream": stream, - "exclude": exclude, - "config": { - "lang": nlp.lang, - "labels": labels, - "exclude_by": "input", - "blocks": [{"view_id": "html", "html_template": html}, {"view_id": "ner_manual"}], - }, - } diff --git a/recon/tokenization.py b/recon/tokenization.py index ec3b0fe..480623f 100644 --- a/recon/tokenization.py +++ b/recon/tokenization.py @@ -33,12 +33,11 @@ def fix_tokenization_and_spacing( get two words pushed together where one is an entity so this can fix a lot of issues. 
Args: - examples (List[Example]): List of examples - tokenizer (str, optional): Name of tokenizer in tokenizers registry to use - verbose (bool, optional): Print status - + example (Example): Input Example + preprocessed_outputs (Dict[str, Any]): Outputs of preprocessors + Returns: - List[Example]: List of examples with fixed tokenization + Example: Example with spans fixed to align to token boundaries. """ doc = preprocessed_outputs["recon.v1.spacy"] @@ -141,12 +140,11 @@ def add_tokens(example: Example, *, preprocessed_outputs: Dict[str, Any]) -> Uni """Add tokens to each Example Args: - examples (List[Example]): List of examples - tokenizer (str, optional): Name of tokenizer in tokenizers registry to use - verbose (bool, optional): Print status - + example (Example): Input Example + preprocessed_outputs (Dict[str, Any]): Outputs of preprocessors + Returns: - List[Example]: List of examples with tokens + Example: Example with tokens """ doc = preprocessed_outputs["recon.v1.spacy"] @@ -166,7 +164,7 @@ def add_tokens(example: Example, *, preprocessed_outputs: Dict[str, Any]) -> Uni for span in example.spans: if span.start in token_starts and span.end in token_ends: span.token_start = token_starts[span.start].i - span.token_end = token_ends[span.end].i + span.token_end = token_ends[span.end].i + 1 if span.token_start is None or span.token_end is None: return None