Mirror of https://github.com/microsoft/reconner.git

format and add type def + docstring for split_sentences

This commit is contained in:
Parent: 8e7367a526
Commit: 655a8796d9
@@ -2,7 +2,8 @@
 import copy
 from collections import defaultdict
-from typing import DefaultDict, Dict, List
+from typing import Any, DefaultDict, Dict, List

 import spacy
+from spacy.tokens import Doc as SpacyDoc, Span as SpacySpan
@@ -118,7 +119,19 @@ spacy_pre = SpacyPreProcessor(nlp)

 @operation("recon.v1.split_sentences", pre=[spacy_pre])
-def split_sentences(example, preprocessed_outputs = {}):
+def split_sentences(example: Example, preprocessed_outputs: Dict[str, Any] = {}) -> List[Example]:
+    """Split a single example into multiple examples by splitting the text into
+    multiple sentences and resetting entity and token offsets based on offsets
+    relative to sentence boundaries
+
+    Args:
+        example (Example): Input Example
+        preprocessed_outputs (Dict[str, Any], optional): Outputs of preprocessors.
+
+    Returns:
+        List[Example]: List of split examples.
+            Could be list of 1 if the example is just one sentence.
+    """
     doc = preprocessed_outputs["recon.v1.spacy"]

     new_examples = []
@@ -127,7 +140,7 @@ def split_sentences(example, preprocessed_outputs = {}):
         span = doc.char_span(ent.start, ent.end, label=ent.label)
         if not span:
             token = None
-            text = doc.text[ent.start:ent.end]
+            text = doc.text[ent.start : ent.end]
             for t in doc:
                 if t.text == text:
                     token = t
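The `if not span:` fallback in this hunk exists because spaCy's `Doc.char_span` returns `None` whenever the character offsets do not land exactly on token boundaries, so the code falls back to matching tokens by surface text. A minimal sketch of that behavior (the sample text is illustrative, not from this commit):

    import spacy

    nlp = spacy.blank("en")  # a blank pipeline is enough to tokenize
    doc = nlp("Recon helps you fix annotations.")

    # Offsets aligned with token boundaries yield a Span
    assert doc.char_span(0, 5).text == "Recon"

    # Offsets that cut into the middle of a token yield None --
    # exactly the case the fallback loop above handles
    assert doc.char_span(0, 3) is None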
@@ -141,20 +154,21 @@ def split_sentences(example, preprocessed_outputs = {}):
         sent_doc = sent.as_doc()
         new_example = Example(
             text=sent_doc.text,
-            spans=[Span(
-                text=e.text,
-                start=e.start_char,
-                end=e.end_char,
-                token_start=e.start,
-                token_end=e.end,
-                label=e.label_
-            ) for e in sent_doc.ents],
-            tokens=[Token(
-                text=t.text,
-                start=t.idx,
-                end=t.idx + len(t.text),
-                id=i
-            ) for i, t in enumerate(sent_doc)]
+            spans=[
+                Span(
+                    text=e.text,
+                    start=e.start_char,
+                    end=e.end_char,
+                    token_start=e.start,
+                    token_end=e.end,
+                    label=e.label_,
+                )
+                for e in sent_doc.ents
+            ],
+            tokens=[
+                Token(text=t.text, start=t.idx, end=t.idx + len(t.text), id=i)
+                for i, t in enumerate(sent_doc)
+            ],
         )
         new_examples.append(new_example)
     return new_examples
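With the new annotations, the operation's contract is explicit: one `Example` in, a list of per-sentence `Example`s out, with span offsets recomputed relative to each sentence. A hedged usage sketch; the `Dataset.apply_` call and the exact `Example`/`Span` constructor arguments are assumptions based on the surrounding library, not shown in this commit:

    from recon.dataset import Dataset
    from recon.types import Example, Span

    # Hypothetical two-sentence example; offsets are relative to the full text
    example = Example(
        text="Apple is in Cupertino. Microsoft is in Redmond.",
        spans=[
            Span(text="Apple", start=0, end=5, label="ORG"),
            Span(text="Microsoft", start=23, end=32, label="ORG"),
        ],
        tokens=[],
    )

    ds = Dataset("demo", [example])
    ds.apply_("recon.v1.split_sentences")  # spacy_pre runs first, then the op

    # Each sentence is now its own Example; "Microsoft" starts at offset 0
    # of the second sentence rather than offset 23 of the full text.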
@@ -66,7 +66,7 @@ class Dataset:
         data: List[Example] = [],
         operations: List[OperationState] = None,
         example_store: ExampleStore = None,
-        verbose: bool = False
+        verbose: bool = False,
     ):
         self.name = name
         self.data = data
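One thing the formatting pass leaves untouched: `data: List[Example] = []` here, like `preprocessed_outputs = {}` above, is a mutable default argument, which Python evaluates once at definition time and then shares across calls. A minimal illustration of the pitfall and the conventional fix (names are illustrative):

    from typing import List, Optional

    def add_item(item: int, items: List[int] = []) -> List[int]:
        items.append(item)  # mutates the single shared default list
        return items

    print(add_item(1))  # [1]
    print(add_item(2))  # [1, 2] -- state leaked from the previous call

    def add_item_safe(item: int, items: Optional[List[int]] = None) -> List[int]:
        if items is None:
            items = []  # fresh list on every call
        items.append(item)
        return items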
@@ -150,7 +150,9 @@ class Operation:
                 dataset.example_store.add(new_example)

         new_data = []
-        for orig_example_hash, example, preprocessed_outputs in op_iter(dataset.data, self.pre, verbose=verbose):
+        for orig_example_hash, example, preprocessed_outputs in op_iter(
+            dataset.data, self.pre, verbose=verbose
+        ):
             if preprocessed_outputs:
                 res = self.op(example, *args, preprocessed_outputs=preprocessed_outputs, **kwargs)
             else:
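For context on this branch: `op_iter` pairs each example with the outputs of the operation's registered preprocessors, keyed by preprocessor name, which is how `split_sentences` can read its parsed `Doc` from `preprocessed_outputs["recon.v1.spacy"]` without re-running spaCy. A rough sketch of that contract; this is simplified, and the `name` attribute on preprocessors is an assumption:

    from typing import Any, Dict, Iterator, List, Tuple

    def op_iter_sketch(
        data: List[Any], pre: List[Any], verbose: bool = False
    ) -> Iterator[Tuple[int, Any, Dict[str, Any]]]:
        """Yield (original example hash, example, preprocessor outputs)."""
        for example in data:
            # Run each preprocessor once, keyed by its registered name,
            # e.g. {"recon.v1.spacy": <spacy Doc>}
            outputs: Dict[str, Any] = {p.name: p(example) for p in pre}
            yield hash(example), example, outputs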