Mirror of https://github.com/microsoft/reconner.git
Adding docs and fixing tokenization add_tokens to use token_end as t.i + 1 for consistency with spaCy.
This commit is contained in:
Parent
25283c4c1f
Commit
fb73c8e7f9
|
@ -1 +1,10 @@
|
|||
Operations are functions that operate on either a list of examples or a single example.
|
||||
If the function operates on a single example, Recon will take care of applying it to all examples in a dataset.
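You can also register your own operation. The sketch below is illustrative: the `operation` decorator import path and the custom name `"custom.v1.strip_whitespace"` are assumptions, not part of the built-in set.

```Python
from recon.operations import operation
from recon.types import Example

# Hypothetical custom operation: once registered, Recon can apply it to every
# example in a Dataset via Dataset.apply_("custom.v1.strip_whitespace").
@operation("custom.v1.strip_whitespace")
def strip_whitespace(example: Example) -> Example:
    example.text = example.text.strip()
    return example
```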
|
||||
|
||||
The following operations are built into Recon
|
||||
|
||||
<!-- `recon.v1.filter_overlaps` - -->
|
||||
!!!error
|
||||
... full list of operations to come
|
||||
|
||||
::: recon.operations
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
from pathlib import Path
|
||||
|
||||
import typer
|
||||
from recon.dataset import Dataset
|
||||
from recon.loaders import read_jsonl
|
||||
from recon.stats import get_ner_stats
|
||||
|
||||
|
||||
def main(data_dir: Path):
|
||||
ds = Dataset.from_disk(data_dir)
|
||||
train_stats = get_ner_stats(ds.train)
|
||||
get_ner_stats(ds.train, serialize=True)
|
||||
def main(data_file: Path):
|
||||
data = read_jsonl(data_file)
|
||||
train_stats = get_ner_stats(data)
|
||||
print(get_ner_stats(data, serialize=True))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
from pathlib import Path
|
||||
|
||||
import typer
|
||||
from recon.dataset import Dataset
|
||||
from recon.stats import get_ner_stats
|
||||
|
||||
|
||||
def main(data_file: Path, output_file: Path):
|
||||
ds = Dataset("train").from_disk(data_file)
|
||||
|
||||
print("STATS BEFORE")
|
||||
print("============")
|
||||
print(ds.apply(get_ner_stats, serialize=True))
|
||||
|
||||
ds.apply_("recon.v1.upcase_labels")
|
||||
|
||||
print("STATS AFTER")
|
||||
print("===========")
|
||||
print(ds.apply(get_ner_stats, serialize=True))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
typer.run(main)
|
|
@ -0,0 +1,25 @@
|
|||
from pathlib import Path
|
||||
|
||||
import typer
|
||||
from recon.dataset import Dataset
|
||||
from recon.stats import get_ner_stats
|
||||
|
||||
|
||||
def main(data_file: Path, output_file: Path):
|
||||
ds = Dataset("train").from_disk(data_file)
|
||||
|
||||
print("STATS BEFORE")
|
||||
print("============")
|
||||
print(ds.apply(get_ner_stats, serialize=True))
|
||||
|
||||
ds.apply_("recon.v1.upcase_labels")
|
||||
|
||||
print("STATS AFTER")
|
||||
print("===========")
|
||||
print(ds.apply(get_ner_stats, serialize=True))
|
||||
|
||||
ds.to_disk(output_file, force=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
typer.run(main)
|
|
@ -0,0 +1,15 @@
|
|||
from pathlib import Path
|
||||
|
||||
import typer
|
||||
from recon.loaders import read_jsonl
|
||||
from recon.stats import get_ner_stats
|
||||
|
||||
|
||||
def main(data_file: Path):
|
||||
data = read_jsonl(data_file)
|
||||
train_stats = get_ner_stats(data)
|
||||
print(get_ner_stats(data, serialize=True))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
typer.run(main)
|
|
@ -1,6 +1,6 @@
|
|||
# Loading your data
|
||||
|
||||
ReconNER expects your data to be in the most basic [Prodigy Annotation Format](https://prodi.gy/docs/api-interfaces#ner).
|
||||
Recon NER expects your data to be in the most basic [Prodigy Annotation Format](https://prodi.gy/docs/api-interfaces#ner).
|
||||
|
||||
A single example in this format looks like:
|
||||
|
||||
|
@ -11,75 +11,42 @@ A single example in this format looks like:
|
|||
}
|
||||
```
|
||||
|
||||
ReconNER does require that you have the tokens property set and will try to resolve any tokenization errors in your
|
||||
data for you. If your data has already been tokenized (which is true if you used the ner_manual Prodigy recipe), ReconNER
|
||||
will skip the tokenization step.
|
||||
Recon does require that you have the tokens property set and will try to resolve any tokenization errors in your
|
||||
data for you as well as add tokens if they don't already exist. If your data has already been tokenized (which is true if you used the ner_manual Prodigy recipe), Recon will skip the tokenization step.
|
||||
|
||||
ReconNER expects your data to be in a collection in the `.jsonl` File Format.
|
||||
Recon expects your data to be in a collection in a JSONL or JSON file.
|
||||
|
||||
## Load Corpus from_disk
|
||||
|
||||
There are several utilities available for loading your data.
|
||||
!!!note
|
||||
More loaders for different file types (`CONLL`) will be added in future versions
|
||||
|
||||
|
||||
The easiest way to load your data is to initialize a [Corpus](../api/corpus.md) from disk.
|
||||
If you have a train/dev/test split or just train/dev files in the same directory, it's as easy as calling the `from_disk` `classmethod` for the `Corpus` object.
|
||||
## Loaders
|
||||
|
||||
```Python
|
||||
corpus = Corpus.from_disk('path/to/data_dir')
|
||||
Recon comes with a few loaders, `read_jsonl` and `read_json`. They're simple enough: they just load the data from disk and create instances of the strongly typed `Example` class for each raw example.
|
||||
|
||||
The `Example` class provides some basic validation that ensures all spans have a text property (which they don't if you're using newer versions of Prodigy and the ner.manual recipe for annotation).
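As a rough sketch of what that type looks like (the field names below — `text`, `start`, `end`, `label` — are assumed from the Prodigy-style format shown earlier, and this assumes `text` and `spans` are the only required fields), you can also construct an `Example` by hand:

```Python
from recon.types import Example, Span

# Hand-built Example; note every Span carries its own text property.
example = Example(
    text="Experience with Python and Azure required.",
    spans=[
        Span(text="Python", start=16, end=22, label="SKILL"),
        Span(text="Azure", start=27, end=32, label="PRODUCT"),
    ],
)
print(example.spans[0].label)  # SKILL
```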
|
||||
|
||||
Everything in Recon is built to run on a single `Example` or a `List[Example]`.
|
||||
|
||||
However, the goal of Recon is to provide insights across all of your annotated examples, not just one. For this, we need a wrapper around a set of examples. This is called a [`Dataset`](/api/dataset).
|
||||
|
||||
Let's use the `read_jsonl` loader to load some annotated data created with Prodigy
|
||||
|
||||
!!!tip
|
||||
If you don't have any data available, you can use the data in the examples folder [here](https://github.com/microsoft/reconner/tree/master/examples/data/skills). We'll be using this data for the rest of the tutorial.
|
||||
|
||||
```python
|
||||
from recon.loaders import read_jsonl
|
||||
from recon.types import Example
|
||||
|
||||
|
||||
data = read_jsonl('examples/data/skills/train.jsonl')
|
||||
|
||||
assert all(isinstance(example, Example) for example in data)
|
||||
```
|
||||
|
||||
`Corpus.from_disk` will look in the `data_dir` you provide for a file structure that looks like:
|
||||
|
||||
```
|
||||
data_dir
|
||||
│ train.jsonl
|
||||
│ dev.jsonl
|
||||
│ test.jsonl
|
||||
```
|
||||
|
||||
!!! tip
|
||||
The test.jsonl file is **optional** but generally you should split your annotated data into train/dev/test files.
|
||||
|
||||
## The Process of Loading Data
|
||||
|
||||
While it's recommended to load data using the `Corpus.from_disk` method, you can also load data directly from disk using the `loaders.read_jsonl` and `loaders.read_json` functions.
|
||||
|
||||
These functions expect the same example format (in fact, `Corpus.from_disk` runs the `loaders.read_jsonl` function) and run a few steps.
|
||||
|
||||
The default pipeline that ReconNER runs when you load data is the following:
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
AA[File System Prodigy Format Dataset `train.jsonl`] -->|srsly.read_jsonl| A[Raw Data in Prodigy Dict format]
|
||||
A -->|fix_tokenization_and_spacing| B[Raw Data with all annotations aligned to spaCy token boundaries]
|
||||
B -->|add_tokens| C[Raw Data with added `tokens` property]
|
||||
C -->|fix_annotations_format| D[Raw Data with fixed annotations format.]
|
||||
D -->|json_to_examples| E[Strongly typed List of recon.types.Example instances ready for downstream analysis]
|
||||
|
||||
```
|
||||
|
||||
### 1. Read data from disk
|
||||
Loads your data with <a href="https://github.com/explosion/srsly" class="external-link" target="_blank">srsly</a> using `srsly.read_jsonl` or `srsly.read_json`
|
||||
|
||||
### 2. Fix Tokenization and Spacing
|
||||
Fixes all annotations that are not aligned to spaCy token boundaries if possible. If examples cannot easily be fixed, the default behavior is to remove these
|
||||
examples. This should rarely be the case and if this function cannot correct the Example it was likely a bad example that would confuse your model anyway.
|
||||
|
||||
### 3. Add Tokens
|
||||
The previous step computes token boundaries but then can alter the text of some examples to fix spacing and tokenization issues.
|
||||
In this step we rerun the spaCy tokenizer and add a tokens property to the data in line with the Prodigy format.
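Roughly speaking (the fields follow the Prodigy `tokens` convention of `text`, `start`, `end`, and `id`; treat this as an illustration rather than Recon's exact output), an example after this step looks like:

```Python
# Illustration of the shape of an example once tokens have been added.
example_with_tokens = {
    "text": "Python developer",
    "spans": [{"text": "Python", "start": 0, "end": 6, "label": "SKILL"}],
    "tokens": [
        {"text": "Python", "start": 0, "end": 6, "id": 0},
        {"text": "developer", "start": 7, "end": 16, "id": 1},
    ],
}
```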
|
||||
|
||||
### 4. Fix Annotation Format
|
||||
Fixes some common issues in annotation formatting using the [`validation.fix_annotations_format`](../../api/validation/#fix_annotations_format) function.
|
||||
|
||||
### 5. Filter Overlapping Entities
|
||||
Often, you'll find your data has overlapping entities. For instance, imagine you have 2 annotators and one decided "Tesla" is a `PRODUCT` and the other noticed that the sentence is actually about "Tesla Motors" which they label as an `ORG`. This function does its best to resolve these overlaps and in the case above would select "Tesla Motors" `ORG` as the correct entity, deleting "Tesla" `PRODUCT` from the data (see [`validation.filter_overlaps`](../../api/validation/#filter_overlaps)).
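As a purely illustrative sketch of the idea (this is not Recon's actual implementation), one simple policy is to keep the longest span whenever two annotations overlap:

```Python
# Illustrative only: resolve overlapping spans by preferring the longest one.
spans = [
    {"text": "Tesla", "start": 0, "end": 5, "label": "PRODUCT"},
    {"text": "Tesla Motors", "start": 0, "end": 12, "label": "ORG"},
]

def overlaps(a, b):
    return a["start"] < b["end"] and b["start"] < a["end"]

kept = []
for span in sorted(spans, key=lambda s: s["end"] - s["start"], reverse=True):
    if not any(overlaps(span, k) for k in kept):
        kept.append(span)

print(kept)  # only the "Tesla Motors" ORG span survives
```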
|
||||
|
||||
### 6. Load into ReconNER type system
|
||||
|
||||
Finally these loaders will take a list of JSON examples in the Prodigy Annotation Format outlined above and convert it into a list of `Example` models using <a href="https://pydantic-docs.helpmanual.io/" class="external-link" target="_blank">Pydantic</a>
|
||||
Now that we have some examples to work with, we can start examining our data.
|
||||
|
||||
## Next Steps
|
||||
|
||||
Once you have your data loaded, you can run other ReconNER functions on top of it to gain insights into the quality and completeness of your NER data
|
||||
Once you have your data loaded, you can run other Recon functions on top of it to gain insights into the quality and completeness of your NER data, as well as start making corrections to the inconsistently annotated examples you almost certainly have. (Don't worry, that's fine! Messy data is everywhere, even at Microsoft.)
|
||||
|
|
|
@ -6,11 +6,11 @@ Once you have your data loaded either by itself as a list of `Example`s or as a
|
|||
|
||||
The `stats.get_ner_stats` function expects a `List[Example]` as its input parameter and will return a serializable response with info about your data. Let's see how this works on the provided example data.
|
||||
|
||||
!!! tip
|
||||
<!-- !!! tip
|
||||
If you don't already have the example data or a dataset of your own, you can download it now. Open a terminal and run the `download` command.
|
||||
```console
|
||||
$ recon download examples ./data
|
||||
```
|
||||
``` -->
|
||||
|
||||
|
||||
## Example
|
||||
|
@ -21,34 +21,51 @@ Create a file main.py with:
|
|||
{!./src/tutorial/1_stats.py!}
|
||||
```
|
||||
|
||||
Run the application with the example data.
|
||||
Run the application with the example data and you should see the following results.
|
||||
|
||||
<div class="termy">
|
||||
|
||||
```console
|
||||
$ python main.py ./examples/data/skills
|
||||
$ python main.py ./examples/data/skills/train.jsonl
|
||||
{
|
||||
"n_examples":102,
|
||||
"n_examples":106,
|
||||
"n_examples_no_entities":29,
|
||||
"ents_per_type":{
|
||||
"SKILL":191,
|
||||
"PRODUCT":34,
|
||||
"JOB_ROLE":5
|
||||
}
|
||||
"n_annotations":243,
|
||||
"n_annotations_per_type":{
|
||||
"SKILL":197,
|
||||
"PRODUCT":33,
|
||||
"JOB_ROLE":10,
|
||||
"skill":2,
|
||||
"product":1
|
||||
},
|
||||
"examples_with_type":null
|
||||
}
|
||||
```
|
||||
|
||||
</div>
|
||||
|
||||
But it isn't super helpful to have stats on **just** your training data.
|
||||
And it'd be really annoying to have to call the same function on each dataset:
|
||||
Great! We have some basic stats about our data but we can already see some issues. Looks like some of our examples are annotated with lowercase labels. These are obviously mistakes and we'll see how to fix these shortly.
|
||||
|
||||
But first, it isn't super helpful to have stats on **just** your `train` data.
|
||||
And it'd be really annoying to have to call the same function on each list of examples:
|
||||
|
||||
```Python
|
||||
get_ner_stats(ds.train, serialize=True)
|
||||
get_ner_stats(ds.dev, serialize=True)
|
||||
get_ner_stats(ds.test, serialize=True)
|
||||
train = read_jsonl(train_file)
|
||||
print(get_ner_stats(train, serialize=True))
|
||||
|
||||
dev = read_jsonl(dev_file)
|
||||
print(get_ner_stats(dev, serialize=True))
|
||||
|
||||
test = read_jsonl(test_file)
|
||||
print(get_ner_stats(test, serialize=True))
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
In the next step of this tutorial you'll learn about how to remove the above boilerplate and run functions across your train/dev/test Dataset split.
|
||||
In the next step of this tutorial we'll introduce the core containers Recon uses for managing examples and state:
|
||||
|
||||
1. [`Dataset`](/api/dataset) - A `Dataset` has a name and holds a list of examples. It's also responsible for tracking any mutations done to its internal data through Recon operations. (More on this [later](link_to_operations))
|
||||
|
||||
and
|
||||
|
||||
2. [`Corpus`](/api/corpus). A `Corpus` is a wrapper around a set of datasets that represent a typical train/eval or train/dev/test split. Using a `Corpus` allows you to gain insights on how well your train set represents your dev/test sets.
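As a quick sketch (the `recon.corpus` import path and the `Corpus.apply` helper are assumptions here; `Corpus.from_disk` is shown in the loading tutorial), working with a Corpus looks a lot like working with a single Dataset:

```Python
from recon.corpus import Corpus
from recon.stats import get_ner_stats

# Assumes a directory containing train.jsonl / dev.jsonl (and optionally test.jsonl)
corpus = Corpus.from_disk("./examples/data/skills")
print(corpus.apply(get_ner_stats, serialize=True))
```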
|
|
@ -0,0 +1,126 @@
|
|||
In Recon, a [`Dataset`](/api/dataset) has a few responsibilities.
|
||||
|
||||
* Store examples
|
||||
* Store state of **every** mutation made to it using recon operations
|
||||
* Provide an easy interface to apply functions and pipelines to the dataset data
|
||||
* Easily serialize and deserialize from/to disk to track state of data across the duration of an annotation project
|
||||
|
||||
|
||||
## Getting Started with Datasets
|
||||
|
||||
The easiest way to get started with a `Dataset` is using the `from_disk` method.
|
||||
|
||||
The following example starts by initializing a Dataset with a name ("train") and loading the train.jsonl data for the skills example dataset
|
||||
|
||||
Replace the code in main.py with the following
|
||||
|
||||
```Python
|
||||
from pathlib import Path
|
||||
|
||||
import typer
|
||||
from recon.dataset import Dataset
|
||||
from recon.stats import get_ner_stats
|
||||
|
||||
|
||||
def main(data_file: Path):
|
||||
    ds = Dataset("train").from_disk(data_file)
|
||||
    print(get_ner_stats(ds.data, serialize=True))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
    typer.run(main)
|
||||
```
|
||||
|
||||
and run with the same command. You should see the exact same result as you did without
|
||||
using a Dataset. That's because `Dataset.from_disk` calls `read_jsonl`.
|
||||
|
||||
```console
|
||||
$ python main.py ./examples/data/skills/train.jsonl
|
||||
{
|
||||
"n_examples":106,
|
||||
"n_examples_no_entities":29,
|
||||
"n_annotations":243,
|
||||
"n_annotations_per_type":{
|
||||
"SKILL":197,
|
||||
"PRODUCT":33,
|
||||
"JOB_ROLE":10,
|
||||
"skill":2,
|
||||
"product":1
|
||||
},
|
||||
"examples_with_type":null
|
||||
}
|
||||
```
|
||||
|
||||
## Applying functions to Datasets
|
||||
|
||||
In the previous example we called the `get_ner_stats` function on the data from the train `Dataset`.
|
||||
`Dataset` provides a utility function called `apply`. `Dataset.apply` takes any function that operates on a List of Examples and runs it on the Dataset's internal data.
|
||||
|
||||
```Python
|
||||
from pathlib import Path
|
||||
|
||||
import typer
|
||||
from recon.dataset import Dataset
|
||||
from recon.stats import get_ner_stats
|
||||
|
||||
|
||||
def main(data_file: Path):
|
||||
    ds = Dataset("train").from_disk(data_file)
|
||||
    print(ds.apply(get_ner_stats, serialize=True))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
    typer.run(main)
|
||||
```
|
||||
|
||||
This might not be that interesting (it doesn't save you a ton of code) but `Dataset.apply` can accept either a function or a name for a registered Recon operation. All functions are registered in a Recon registry.
|
||||
|
||||
All functions packaged with recon have "recon.vN..." as a prefix.
|
||||
|
||||
So the above example can be converted to:
|
||||
|
||||
```Python
|
||||
from pathlib import Path
|
||||
|
||||
import typer
|
||||
from recon.dataset import Dataset
|
||||
|
||||
|
||||
def main(data_file: Path):
|
||||
    ds = Dataset("train").from_disk(data_file)
|
||||
    print(ds.apply("recon.v1.get_ner_stats", serialize=True))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
    typer.run(main)
|
||||
```
|
||||
|
||||
This means you don't have to import the `get_ner_stats` function. For a full list of operations see
|
||||
the [operations API guide](/api/operations).
|
||||
|
||||
All of these examples should return the exact same response. See for yourself:
|
||||
|
||||
<div class="termy">
|
||||
|
||||
```console
|
||||
$ python main.py ./examples/data/skills/train.jsonl
|
||||
{
|
||||
"n_examples":106,
|
||||
"n_examples_no_entities":29,
|
||||
"n_annotations":243,
|
||||
"n_annotations_per_type":{
|
||||
"SKILL":197,
|
||||
"PRODUCT":33,
|
||||
"JOB_ROLE":10,
|
||||
"skill":2,
|
||||
"product":1
|
||||
},
|
||||
"examples_with_type":null
|
||||
}
|
||||
```
|
||||
</div>
|
||||
|
||||
## Next Steps
|
||||
|
||||
It's great that we can manage our data operations using a Dataset and named functions but our data is still messy. We still have those pesky lowercased labels for "skill" and "product" that should clearly be "SKILL" and "PRODUCT" respectively.
|
||||
In the next step of the tutorial we'll learn how to run operations that mutate a `Dataset` and everything Recon does to keep track of these operations for you.
|
|
@ -0,0 +1,311 @@
|
|||
Now that we have our data managed in a Recon `Dataset`, we can make corrections to our data automatically and Recon will take care of keeping track of all operations and transformations run on our data.
|
||||
|
||||
The key is the `Dataset.apply_` function.
|
||||
|
||||
!!!tip
|
||||
It's a common Python convention (as far as I know popularized by PyTorch) to have a function return a value (i.e. `apply`) and that same function name followed by an underscore (i.e. `apply_`) operate on the data in place.
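Concretely, using the calls we've already seen in this tutorial:

```Python
from recon.dataset import Dataset
from recon.stats import get_ner_stats

ds = Dataset("train").from_disk("./examples/data/skills/train.jsonl")

# apply returns a value and leaves the Dataset's data untouched
stats = ds.apply(get_ner_stats, serialize=True)

# apply_ runs a registered operation over the Dataset's data in place
ds.apply_("recon.v1.upcase_labels")
```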
|
||||
|
||||
## Correcting a Dataset
|
||||
|
||||
`Dataset.apply_` requires a registered in-place operation that will run across all examples in the Dataset's data.
|
||||
|
||||
Let's see an example.
|
||||
|
||||
```Python hl_lines="15"
|
||||
{!./src/tutorial/3_dataset_mutate.py!}
|
||||
```
|
||||
|
||||
<div class="termy">
|
||||
|
||||
```console
|
||||
$ python main.py examples/data/skills/train.jsonl
|
||||
|
||||
STATS BEFORE
|
||||
============
|
||||
{
|
||||
"n_examples":106,
|
||||
"n_examples_no_entities":29,
|
||||
"n_annotations":243,
|
||||
"n_annotations_per_type":{
|
||||
"SKILL":197,
|
||||
"PRODUCT":33,
|
||||
"JOB_ROLE":10,
|
||||
"skill":2,
|
||||
"product":1
|
||||
},
|
||||
"examples_with_type":null
|
||||
}
|
||||
STATS AFTER
|
||||
===========
|
||||
{
|
||||
"n_examples":106,
|
||||
"n_examples_no_entities":29,
|
||||
"n_annotations":243,
|
||||
"n_annotations_per_type":{
|
||||
"SKILL":199,
|
||||
"PRODUCT":34,
|
||||
"JOB_ROLE":10
|
||||
},
|
||||
"examples_with_type":null
|
||||
}
|
||||
```
|
||||
</div>
|
||||
|
||||
Nice! We've easily applied the built-in "upcase_labels" function from Recon to fix our obvious mistakes.
|
||||
|
||||
But that's not all...
|
||||
|
||||
## Tracking operations
|
||||
|
||||
It would be really easy to lose track of the operations run on our data if we ran a bunch of operations. Even with our single operation, we'd have to save a copy of the data before running the `upcase_labels` operation and drill into both versions of the dataset to identify which examples we actually changed. Recon takes care of this tracking for us.
|
||||
|
||||
Let's extend our previous example by saving our new Dataset to disk using (conveniently) `Dataset.to_disk`.
|
||||
|
||||
|
||||
```Python hl_lines="21"
|
||||
{!./src/tutorial/3_dataset_mutate_save.py!}
|
||||
```
|
||||
|
||||
<div class="termy">
|
||||
|
||||
```console
|
||||
$ python main.py examples/data/skills/train.jsonl examples/fixed_data/skills/train.jsonl
|
||||
|
||||
STATS BEFORE
|
||||
============
|
||||
{
|
||||
"n_examples":106,
|
||||
"n_examples_no_entities":29,
|
||||
"n_annotations":243,
|
||||
"n_annotations_per_type":{
|
||||
"SKILL":197,
|
||||
"PRODUCT":33,
|
||||
"JOB_ROLE":10,
|
||||
"skill":2,
|
||||
"product":1
|
||||
},
|
||||
"examples_with_type":null
|
||||
}
|
||||
STATS AFTER
|
||||
===========
|
||||
{
|
||||
"n_examples":106,
|
||||
"n_examples_no_entities":29,
|
||||
"n_annotations":243,
|
||||
"n_annotations_per_type":{
|
||||
"SKILL":199,
|
||||
"PRODUCT":34,
|
||||
"JOB_ROLE":10
|
||||
},
|
||||
"examples_with_type":null
|
||||
}
|
||||
|
||||
```
|
||||
</div>
|
||||
|
||||
This should have the same console output as before.
|
||||
|
||||
Let's investigate what Recon saved.
|
||||
|
||||
|
||||
```Python hl_lines="15"
|
||||
{!./src/tutorial/3_dataset_mutate.py!}
|
||||
```
|
||||
|
||||
<div class="termy">
|
||||
|
||||
```console
|
||||
$ ll -a examples/fixed_data/skills
|
||||
|
||||
├── .recon
|
||||
├── train.jsonl
|
||||
|
||||
// train.jsonl is just our serialized data from our train Dataset
|
||||
// What's in .recon?
|
||||
|
||||
$ tree -a examples/fixed_data/skills/
|
||||
|
||||
examples/fixed_data/skills/
|
||||
├── .recon
|
||||
│ ├── example_store.jsonl
|
||||
│ └── train
|
||||
│ └── state.json
|
||||
└── train.jsonl
|
||||
|
||||
// Let's investigate the state.json for our train dataset.
|
||||
|
||||
$ cat examples/fixed_data/skills/.recon/train/state.json
|
||||
|
||||
{
|
||||
"name":"recon.v1.upcase_labels",
|
||||
"batch":false,
|
||||
"args":[],
|
||||
"kwargs":{},
|
||||
"status":"COMPLETED",
|
||||
"ts":1586687281,
|
||||
"examples_added":0,
|
||||
"examples_removed":0,
|
||||
"examples_changed":3,
|
||||
"transformations":[
|
||||
{
|
||||
"prev_example":1923088532738022750,
|
||||
"example":1401028415299739275,
|
||||
"type":"EXAMPLE_CHANGED"
|
||||
},
|
||||
{
|
||||
"prev_example":459906967662468309,
|
||||
"example":1525998324968157929,
|
||||
"type":"EXAMPLE_CHANGED"
|
||||
},
|
||||
{
|
||||
"prev_example":200276835658424828,
|
||||
"example":407710308633891847,
|
||||
"type":"EXAMPLE_CHANGED"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
</div>
|
||||
|
||||
In the last command above you can see the output of what Recon saves when you call `Dataset.to_disk`.
|
||||
|
||||
Let's dig into the saved state a bit more.
|
||||
|
||||
## Dataset state
|
||||
|
||||
The first property stored is the dataset name. Pretty self-explanatory.
|
||||
The second, the `commit_hash`, is a bit more complex.
|
||||
|
||||
!!!tip
|
||||
A core principle of Recon is that all of its data types can be hashed deterministically. This means you'll get the same hash across Python environments and sessions for each core data type including: Corpus, Dataset, Example, Span and Token.
|
||||
|
||||
The `commit_hash` property is a SHA-1 hash of the dataset
|
||||
name combined with the hash of each example in the dataset.
|
||||
If you're familiar with how [git](https://git-scm.com/) works, the idea is pretty similar.
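To make the idea concrete, here's a rough, purely illustrative sketch (this is not Recon's actual code; it just shows how a dataset name plus per-example hashes can roll up into a single commit value):

```Python
import hashlib

def commit_hash(dataset_name, example_hashes):
    # Any change to the name or to any example hash changes the commit value.
    h = hashlib.sha1(dataset_name.encode("utf-8"))
    for example_hash in example_hashes:
        h.update(str(example_hash).encode("utf-8"))
    return h.hexdigest()

print(commit_hash("train", [1923088532738022750, 459906967662468309]))
```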
|
||||
|
||||
The `commit_hash` of a dataset allows us to understand whether a Dataset has changed between operations.
|
||||
This can happen if you add new examples and want to rerun or run new operations later based on insights from that new data.
|
||||
|
||||
```json hl_lines="4 6 7 8 9 10"
|
||||
{
|
||||
"name": "train",
|
||||
"commit_hash": "1923088532738022750",
|
||||
"operations": [
|
||||
{
|
||||
"name":"recon.v1.upcase_labels",
|
||||
"args":[],
|
||||
"kwargs":{},
|
||||
"status":"COMPLETED",
|
||||
"ts":1586687281,
|
||||
"examples_added":0,
|
||||
"examples_removed":0,
|
||||
"examples_changed":3,
|
||||
"transformations":[
|
||||
{
|
||||
"prev_example":1923088532738022750,
|
||||
"example":1401028415299739275,
|
||||
"type":"EXAMPLE_CHANGED"
|
||||
},
|
||||
{
|
||||
"prev_example":459906967662468309,
|
||||
"example":1525998324968157929,
|
||||
"type":"EXAMPLE_CHANGED"
|
||||
},
|
||||
{
|
||||
"prev_example":200276835658424828,
|
||||
"example":407710308633891847,
|
||||
"type":"EXAMPLE_CHANGED"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The core of the stored state is the `operations` property. It has all the information needed to both track and re-run each operation on a dataset.
|
||||
|
||||
In the above state we have 1 operation since that's all we've run on our dataset so far.
|
||||
|
||||
Each operation has a `name` (in this case `"recon.v1.upcase_labels"`) as well as any Python `args` or `kwargs` the function was run with. The `upcase_labels` operation has no required parameters so these are empty (we'll see some examples where these are not empty later in the tutorial).
|
||||
|
||||
We also have a `status` (one of: NOT_STARTED, IN_PROGRESS, COMPLETED) and a `ts` (timestamp of when the operation was run).
|
||||
|
||||
These attributes provide the base information needed to re-create the exact function call that was run.
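In principle that means a recorded operation could be replayed against a dataset. The sketch below is hypothetical: it assumes the saved state has the `operations` layout shown below and that `apply_` forwards the stored `args`/`kwargs` to the operation.

```Python
import srsly
from recon.dataset import Dataset

ds = Dataset("train").from_disk("examples/data/skills/train.jsonl")

# Hypothetical replay of every recorded operation, in order.
state = srsly.read_json("examples/fixed_data/skills/.recon/train/state.json")
for op in state["operations"]:
    ds.apply_(op["name"], *op["args"], **op["kwargs"])
```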
|
||||
|
||||
The rest of the properties deal with transformation tracking.
|
||||
The `examples_added`, `examples_removed`, and `examples_changed` properties give you a summary of the overall changes made by the operation.
|
||||
|
||||
```json hl_lines="11 12 13"
|
||||
{
|
||||
"name": "train",
|
||||
"commit_hash": "1923088532738022750",
|
||||
"operations": [
|
||||
{
|
||||
"name":"recon.v1.upcase_labels",
|
||||
"args":[],
|
||||
"kwargs":{},
|
||||
"status":"COMPLETED",
|
||||
"ts":1586687281,
|
||||
"examples_added":0,
|
||||
"examples_removed":0,
|
||||
"examples_changed":3,
|
||||
"transformations":[
|
||||
{
|
||||
"prev_example":1923088532738022750,
|
||||
"example":1401028415299739275,
|
||||
"type":"EXAMPLE_CHANGED"
|
||||
},
|
||||
{
|
||||
"prev_example":459906967662468309,
|
||||
"example":1525998324968157929,
|
||||
"type":"EXAMPLE_CHANGED"
|
||||
},
|
||||
{
|
||||
"prev_example":200276835658424828,
|
||||
"example":407710308633891847,
|
||||
"type":"EXAMPLE_CHANGED"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
Finally, the `transformations` property is the most useful for actually auditing and tracking your data changes.
|
||||
|
||||
```json hl_lines="14 16 17 18"
|
||||
{
|
||||
"name": "train",
|
||||
"commit_hash": "1923088532738022750",
|
||||
"operations": [
|
||||
{
|
||||
"name":"recon.v1.upcase_labels",
|
||||
"args":[],
|
||||
"kwargs":{},
|
||||
"status":"COMPLETED",
|
||||
"ts":1586687281,
|
||||
"examples_added":0,
|
||||
"examples_removed":0,
|
||||
"examples_changed":3,
|
||||
"transformations":[
|
||||
{
|
||||
"prev_example":1923088532738022750,
|
||||
"example":1401028415299739275,
|
||||
"type":"EXAMPLE_CHANGED"
|
||||
},
|
||||
{
|
||||
"prev_example":459906967662468309,
|
||||
"example":1525998324968157929,
|
||||
"type":"EXAMPLE_CHANGED"
|
||||
},
|
||||
{
|
||||
"prev_example":200276835658424828,
|
||||
"example":407710308633891847,
|
||||
"type":"EXAMPLE_CHANGED"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
|
@ -3,11 +3,6 @@
|
|||
Now that we have a Corpus loaded, let's investigate the rest of the `recon.stats` module.
|
||||
The best way to use the `recon.stats` module is through Prodigy directly.
|
||||
|
||||
Recon comes packaged with sidecar Prodigy Recipes for core NER annotation workflows.
|
||||
|
||||
|
||||
Full Prodigy recipe map
|
||||
|
||||
|
||||
* `ner.manual` => `recon.ner_manual`
|
||||
* `ner.correct` => `recon.ner_correct`
|
||||
!!!tip
|
||||
I'm working on a dashboard for Recon to display all the statistics and distributions of NER data. These docs will be updated in depth with how to use all the stats through the dashboard. For a full reference see the [recon.stats API docs](/api/stats)
|
|
@ -157,7 +157,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -184,6 +184,35 @@
|
|||
"corpus._train.operations"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{\n",
|
||||
" \"n_examples\":106,\n",
|
||||
" \"n_examples_no_entities\":29,\n",
|
||||
" \"n_annotations\":243,\n",
|
||||
" \"n_annotations_per_type\":{\n",
|
||||
" \"SKILL\":197,\n",
|
||||
" \"PRODUCT\":33,\n",
|
||||
" \"JOB_ROLE\":10,\n",
|
||||
" \"skill\":2,\n",
|
||||
" \"product\":1\n",
|
||||
" },\n",
|
||||
" \"examples_with_type\":null\n",
|
||||
"}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(get_ner_stats(corpus._train.data, serialize=True))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
|
|
|
@ -20,6 +20,9 @@ nav:
|
|||
- Tutorial:
|
||||
- Loading your data: 'tutorial/1_loading_data.md'
|
||||
- NER Stats: 'tutorial/2_ner_stats.md'
|
||||
- Dataset:
|
||||
- Introduction: 'tutorial/3_dataset_intro.md'
|
||||
- Making changes to a Dataset: 'tutorial/3_dataset_mutate.md'
|
||||
- Using a Corpus: 'tutorial/3_corpus_apply.md'
|
||||
- More NER Stats: 'tutorial/4_more_stats.md'
|
||||
- Getting Insights: 'tutorial/5_getting_insights.md'
|
||||
|
|
|
@ -31,6 +31,7 @@ def op_iter(
|
|||
|
||||
Args:
|
||||
data (List[Example]): List of examples to iterate
|
||||
pre (List[PreProcessor]): List of preprocessors to run
|
||||
|
||||
Yields:
|
||||
Iterator[Tuple[int, Example]]: Tuples of (example hash, example)
|
||||
|
@ -52,7 +53,7 @@ class operation:
|
|||
|
||||
Args:
|
||||
name (str): Operation name.
|
||||
batch (bool): Send all examples in dataset for batch operation.
|
||||
pre (List[PreProcessor]): List of preprocessors to run
|
||||
"""
|
||||
self.name = name
|
||||
self.pre = pre
|
||||
|
@ -89,7 +90,7 @@ class Operation:
|
|||
|
||||
Args:
|
||||
name (str): Name of operation
|
||||
batch (bool): Whether the operation handles a batch of data or not
|
||||
pre (List[PreProcessor]): List of preprocessors to run
|
||||
op (Callable): Decorated function
|
||||
"""
|
||||
self.name = name
|
||||
|
|
|
@ -1,133 +0,0 @@
|
|||
# isort:skip_file
|
||||
# type: ignore
|
||||
|
||||
from collections import Counter, defaultdict
|
||||
import copy
|
||||
import random
|
||||
from typing import Any, Dict, Iterable, List, Optional, Union
|
||||
|
||||
import catalogue
|
||||
import prodigy
|
||||
from prodigy.components.db import connect
|
||||
from prodigy.components.loaders import get_stream
|
||||
from prodigy.components.preprocess import add_tokens
|
||||
from prodigy.models.matcher import PatternMatcher
|
||||
from prodigy.recipes.ner import get_labels_from_ner
|
||||
from prodigy.util import (
|
||||
INPUT_HASH_ATTR,
|
||||
TASK_HASH_ATTR,
|
||||
get_labels,
|
||||
log,
|
||||
set_hashes,
|
||||
split_string,
|
||||
)
|
||||
import spacy
|
||||
import srsly
|
||||
from wasabi import msg
|
||||
from recon.constants import NONE
|
||||
from recon.loaders import read_jsonl
|
||||
from recon.types import HardestExample, Example, Span
|
||||
from recon.validation import remove_overlapping_entities
|
||||
|
||||
import dash
|
||||
import dash_core_components as dcc
|
||||
import dash_html_components as html
|
||||
|
||||
external_stylesheets = ["https://codepen.io/chriddyp/pen/bWLwgP.css"]
|
||||
|
||||
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
|
||||
|
||||
app.layout = html.Div(
|
||||
children=[
|
||||
html.H1(children="Hello Dash"),
|
||||
html.Div(
|
||||
children="""
|
||||
Dash: A web application framework for Python.
|
||||
"""
|
||||
),
|
||||
dcc.Graph(
|
||||
id="example-graph",
|
||||
figure={
|
||||
"data": [
|
||||
{"x": [1, 2, 3], "y": [4, 1, 2], "type": "bar", "name": "SF"},
|
||||
{"x": [1, 2, 3], "y": [2, 4, 5], "type": "bar", "name": "Montréal"},
|
||||
],
|
||||
"layout": {"title": "Dash Data Visualization"},
|
||||
},
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@prodigy.recipe(
|
||||
"recon.ner_manual",
|
||||
# fmt: off
|
||||
dataset=("Dataset to save annotations to", "positional", None, str),
|
||||
spacy_model=("Loadable spaCy model for tokenization or blank:lang (e.g. blank:en)", "positional", None, str),
|
||||
source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
|
||||
api=("DEPRECATED: API loader to use", "option", "a", str),
|
||||
loader=("Loader (guessed from file extension if not set)", "option", "lo", str),
|
||||
label=("Comma-separated label(s) to annotate or text file with one label per line", "option", "l", get_labels),
|
||||
patterns=("Path to match patterns file", "option", "pt", str),
|
||||
exclude=("Comma-separated list of dataset IDs whose annotations to exclude", "option", "e", split_string),
|
||||
# fmt: on
|
||||
)
|
||||
def manual(
|
||||
dataset: str,
|
||||
spacy_model: str,
|
||||
source: Union[str, Iterable[dict]] = "-",
|
||||
api: Optional[str] = None,
|
||||
loader: Optional[str] = None,
|
||||
label: Optional[List[str]] = None,
|
||||
patterns: Optional[str] = None,
|
||||
exclude: Optional[List[str]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Mark spans by token. Requires only a tokenizer and no entity recognizer,
|
||||
and doesn't do any active learning.
|
||||
"""
|
||||
log("RECIPE: Starting recipe ner.manual", locals())
|
||||
if spacy_model.startswith("blank:"):
|
||||
nlp = spacy.blank(spacy_model.replace("blank:", ""))
|
||||
else:
|
||||
nlp = spacy.load(spacy_model)
|
||||
labels = label # comma-separated list or path to text file
|
||||
if not labels:
|
||||
labels = get_labels_from_ner(nlp)
|
||||
if not labels:
|
||||
msg.fail("No --label argument set and no labels found in model", exits=1)
|
||||
msg.text(f"Using {len(labels)} labels from model: {', '.join(labels)}")
|
||||
log(f"RECIPE: Annotating with {len(labels)} labels", labels)
|
||||
stream = get_stream(source, api, loader, rehash=True, dedup=True, input_key="text")
|
||||
if patterns is not None:
|
||||
pattern_matcher = PatternMatcher(nlp, combine_matches=True, all_examples=True)
|
||||
pattern_matcher = pattern_matcher.from_disk(patterns)
|
||||
stream = (eg for _, eg in pattern_matcher(stream))
|
||||
stream = add_tokens(nlp, stream) # add "tokens" key to the tasks
|
||||
|
||||
print(app.index())
|
||||
|
||||
print(app.server.url_map)
|
||||
import requests
|
||||
|
||||
# print(app.server.view_functions['/'])
|
||||
print(app.serve_layout())
|
||||
# app.run_server()
|
||||
# html = requests.get('127.0.0.1:8050/').text()
|
||||
|
||||
# with open('./recon/prodigy/templates/graph.html') as f:
|
||||
# html = f.read()
|
||||
html = ""
|
||||
|
||||
return {
|
||||
"view_id": "blocks",
|
||||
"dataset": dataset,
|
||||
"stream": stream,
|
||||
"exclude": exclude,
|
||||
"config": {
|
||||
"lang": nlp.lang,
|
||||
"labels": labels,
|
||||
"exclude_by": "input",
|
||||
"blocks": [{"view_id": "html", "html_template": html}, {"view_id": "ner_manual"}],
|
||||
},
|
||||
}
|
|
@ -33,12 +33,11 @@ def fix_tokenization_and_spacing(
|
|||
get two words pushed together where one is an entity so this can fix a lot of issues.
|
||||
|
||||
Args:
|
||||
examples (List[Example]): List of examples
|
||||
tokenizer (str, optional): Name of tokenizer in tokenizers registry to use
|
||||
verbose (bool, optional): Print status
|
||||
|
||||
example (Example): Input Example
|
||||
preprocessed_outputs (Dict[str, Any]): Outputs of preprocessors
|
||||
|
||||
Returns:
|
||||
List[Example]: List of examples with fixed tokenization
|
||||
Example: Example with spans fixed to align to token boundaries.
|
||||
"""
|
||||
|
||||
doc = preprocessed_outputs["recon.v1.spacy"]
|
||||
|
@ -141,12 +140,11 @@ def add_tokens(example: Example, *, preprocessed_outputs: Dict[str, Any]) -> Uni
|
|||
"""Add tokens to each Example
|
||||
|
||||
Args:
|
||||
examples (List[Example]): List of examples
|
||||
tokenizer (str, optional): Name of tokenizer in tokenizers registry to use
|
||||
verbose (bool, optional): Print status
|
||||
|
||||
example (Example): Input Example
|
||||
preprocessed_outputs (Dict[str, Any]): Outputs of preprocessors
|
||||
|
||||
Returns:
|
||||
List[Example]: List of examples with tokens
|
||||
Example: Example with tokens
|
||||
"""
|
||||
doc = preprocessed_outputs["recon.v1.spacy"]
|
||||
|
||||
|
@ -166,7 +164,7 @@ def add_tokens(example: Example, *, preprocessed_outputs: Dict[str, Any]) -> Uni
|
|||
for span in example.spans:
|
||||
if span.start in token_starts and span.end in token_ends:
|
||||
span.token_start = token_starts[span.start].i
|
||||
span.token_end = token_ends[span.end].i
|
||||
span.token_end = token_ends[span.end].i + 1
|
||||
|
||||
if span.token_start is None or span.token_end is None:
|
||||
return None
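For reference, the `+ 1` makes `token_end` exclusive, which is how spaCy's own token slicing works. A small sketch (using a blank English pipeline):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Senior Python developer")

# The single-token span "Python" is doc[1:2]: the end index is exclusive,
# exactly like Python list slicing and spaCy's Span.start / Span.end.
span = doc[1:2]
print(span.text)             # Python
print(span.start, span.end)  # 1 2
```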
|
||||
|
|