зеркало из https://github.com/microsoft/reconner.git
Format and add type definitions and a docstring for split_sentences
This commit is contained in:
Родитель
8e7367a526
Коммит
655a8796d9
|
@ -2,7 +2,8 @@
|
|||
|
||||
import copy
|
||||
from collections import defaultdict
|
||||
from typing import DefaultDict, Dict, List
|
||||
from typing import Any, DefaultDict, Dict, List
|
||||
|
||||
import spacy
|
||||
from spacy.tokens import Doc as SpacyDoc, Span as SpacySpan
|
||||
|
||||
|
@ -118,19 +119,31 @@ spacy_pre = SpacyPreProcessor(nlp)
|
|||
|
||||
|
||||
@operation("recon.v1.split_sentences", pre=[spacy_pre])
|
||||
def split_sentences(example, preprocessed_outputs = {}):
|
||||
def split_sentences(example: Example, preprocessed_outputs: Dict[str, Any] = {}) -> List[Example]:
|
||||
"""Split a single example into multiple examples by splitting the text into
|
||||
multiple sentences and resetting entity and token offsets based on offsets
|
||||
relative to sentence boundaries
|
||||
|
||||
Args:
|
||||
example (Example): Input Example
|
||||
preprocessed_outputs (Dict[str, Any], optional): Outputs of preprocessors.
|
||||
|
||||
Returns:
|
||||
List[Example]: List of split examples.
|
||||
Could be list of 1 if the example is just one sentence.
|
||||
"""
|
||||
doc = preprocessed_outputs["recon.v1.spacy"]
|
||||
|
||||
|
||||
new_examples = []
|
||||
ents = []
|
||||
for ent in example.spans:
|
||||
span = doc.char_span(ent.start, ent.end, label=ent.label)
|
||||
if not span:
|
||||
token = None
|
||||
text = doc.text[ent.start:ent.end]
|
||||
text = doc.text[ent.start : ent.end]
|
||||
for t in doc:
|
||||
if t.text == text:
|
||||
token = t
|
||||
token = t
|
||||
if token:
|
||||
span = SpacySpan(doc, token.i, token.i + 1, label=ent.label)
|
||||
ents.append(span)
|
||||
|
@ -141,20 +154,21 @@ def split_sentences(example, preprocessed_outputs = {}):
|
|||
sent_doc = sent.as_doc()
|
||||
new_example = Example(
|
||||
text=sent_doc.text,
|
||||
spans=[Span(
|
||||
text=e.text,
|
||||
start=e.start_char,
|
||||
end=e.end_char,
|
||||
token_start=e.start,
|
||||
token_end=e.end,
|
||||
label=e.label_
|
||||
) for e in sent_doc.ents],
|
||||
tokens=[Token(
|
||||
text=t.text,
|
||||
start=t.idx,
|
||||
end=t.idx + len(t.text),
|
||||
id=i
|
||||
) for i, t in enumerate(sent_doc)]
|
||||
spans=[
|
||||
Span(
|
||||
text=e.text,
|
||||
start=e.start_char,
|
||||
end=e.end_char,
|
||||
token_start=e.start,
|
||||
token_end=e.end,
|
||||
label=e.label_,
|
||||
)
|
||||
for e in sent_doc.ents
|
||||
],
|
||||
tokens=[
|
||||
Token(text=t.text, start=t.idx, end=t.idx + len(t.text), id=i)
|
||||
for i, t in enumerate(sent_doc)
|
||||
],
|
||||
)
|
||||
new_examples.append(new_example)
|
||||
return new_examples
|
||||
|
|
|
@ -66,7 +66,7 @@ class Dataset:
|
|||
data: List[Example] = [],
|
||||
operations: List[OperationState] = None,
|
||||
example_store: ExampleStore = None,
|
||||
verbose: bool = False
|
||||
verbose: bool = False,
|
||||
):
|
||||
self.name = name
|
||||
self.data = data
|
||||
|
@ -151,10 +151,10 @@ class Dataset:
|
|||
Args:
|
||||
operations (List[Union[str, OperationState]]): List of operations
|
||||
"""
|
||||
|
||||
|
||||
msg = Printer(no_print=self.verbose == False)
|
||||
msg.text(f"Applying pipeline of operations inplace to the dataset: {self.name}")
|
||||
|
||||
|
||||
for op in operations:
|
||||
op_name = op.name if isinstance(op, OperationState) else op
|
||||
msg.text(f"|_ {op_name}")
|
||||
|
|
|
@ -116,7 +116,7 @@ class Operation:
|
|||
"""
|
||||
verbose = True
|
||||
msg = Printer(no_print=verbose == False)
|
||||
|
||||
|
||||
initial_state = kwargs.pop("initial_state") if "initial_state" in kwargs else None
|
||||
verbose = kwargs.pop("verbose") if "verbose" in kwargs else None
|
||||
if not initial_state:
|
||||
|
@ -150,7 +150,9 @@ class Operation:
|
|||
dataset.example_store.add(new_example)
|
||||
|
||||
new_data = []
|
||||
for orig_example_hash, example, preprocessed_outputs in op_iter(dataset.data, self.pre, verbose=verbose):
|
||||
for orig_example_hash, example, preprocessed_outputs in op_iter(
|
||||
dataset.data, self.pre, verbose=verbose
|
||||
):
|
||||
if preprocessed_outputs:
|
||||
res = self.op(example, *args, preprocessed_outputs=preprocessed_outputs, **kwargs)
|
||||
else:
|
||||
|
|
Загрузка…
Ссылка в новой задаче