Mirror of https://github.com/microsoft/reconner.git

Commit: 655a8796d9
Parent: 8e7367a526

format and add type def + docstring for split_sentences
@@ -2,7 +2,8 @@
 import copy
 from collections import defaultdict
-from typing import DefaultDict, Dict, List
+from typing import Any, DefaultDict, Dict, List
 
 import spacy
 from spacy.tokens import Doc as SpacyDoc, Span as SpacySpan
 
@@ -118,19 +119,31 @@ spacy_pre = SpacyPreProcessor(nlp)
 
 
 @operation("recon.v1.split_sentences", pre=[spacy_pre])
-def split_sentences(example, preprocessed_outputs = {}):
+def split_sentences(example: Example, preprocessed_outputs: Dict[str, Any] = {}) -> List[Example]:
+    """Split a single example into multiple examples by splitting the text into
+    multiple sentences and resetting entity and token offsets based on offsets
+    relative to sentence boundaries
+
+    Args:
+        example (Example): Input Example
+        preprocessed_outputs (Dict[str, Any], optional): Outputs of preprocessors.
+
+    Returns:
+        List[Example]: List of split examples.
+            Could be list of 1 if the example is just one sentence.
+    """
     doc = preprocessed_outputs["recon.v1.spacy"]
 
     new_examples = []
     ents = []
     for ent in example.spans:
         span = doc.char_span(ent.start, ent.end, label=ent.label)
         if not span:
             token = None
-            text = doc.text[ent.start:ent.end]
+            text = doc.text[ent.start : ent.end]
             for t in doc:
                 if t.text == text:
                     token = t
             if token:
                 span = SpacySpan(doc, token.i, token.i + 1, label=ent.label)
         ents.append(span)
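Context for the fallback logic in this hunk (not part of the diff): spaCy's Doc.char_span returns None when the character offsets do not line up exactly with token boundaries, which is why the code falls back to matching the entity text against individual tokens. A minimal sketch using only standard spaCy API:

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("Conda is a popular package manager.")

    # Offsets that align with token boundaries yield a Span ...
    print(doc.char_span(0, 5))   # Conda
    # ... offsets that cut through a token yield None, which triggers the
    # token-matching fallback seen in split_sentences above.
    print(doc.char_span(0, 4))   # None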
@@ -141,20 +154,21 @@ def split_sentences(example, preprocessed_outputs = {}):
         sent_doc = sent.as_doc()
         new_example = Example(
             text=sent_doc.text,
-            spans=[Span(
-                text=e.text,
-                start=e.start_char,
-                end=e.end_char,
-                token_start=e.start,
-                token_end=e.end,
-                label=e.label_
-            ) for e in sent_doc.ents],
-            tokens=[Token(
-                text=t.text,
-                start=t.idx,
-                end=t.idx + len(t.text),
-                id=i
-            ) for i, t in enumerate(sent_doc)]
+            spans=[
+                Span(
+                    text=e.text,
+                    start=e.start_char,
+                    end=e.end_char,
+                    token_start=e.start,
+                    token_end=e.end,
+                    label=e.label_,
+                )
+                for e in sent_doc.ents
+            ],
+            tokens=[
+                Token(text=t.text, start=t.idx, end=t.idx + len(t.text), id=i)
+                for i, t in enumerate(sent_doc)
+            ],
         )
         new_examples.append(new_example)
     return new_examples
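The new Example can be built directly from sent_doc.ents and the sent_doc tokens because Span.as_doc() produces a standalone Doc whose character and token offsets restart at zero, i.e. they are already relative to the sentence. A minimal sketch of that behaviour (not part of the diff; assumes the spaCy v3 sentencizer API, v2 would use nlp.create_pipe):

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("sentencizer")  # illustration only; pipeline setup is assumed
    doc = nlp("First sentence here. Second one follows.")

    for sent in doc.sents:
        sent_doc = sent.as_doc()
        # Token .idx values restart at 0 for every sentence, so spans and
        # tokens built from sent_doc need no manual offset arithmetic.
        print(sent_doc.text, [(t.text, t.idx) for t in sent_doc])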
@@ -66,7 +66,7 @@ class Dataset:
         data: List[Example] = [],
         operations: List[OperationState] = None,
         example_store: ExampleStore = None,
-        verbose: bool = False
+        verbose: bool = False,
     ):
         self.name = name
         self.data = data
@@ -151,10 +151,10 @@ class Dataset:
         Args:
             operations (List[Union[str, OperationState]]): List of operations
         """
 
         msg = Printer(no_print=self.verbose == False)
         msg.text(f"Applying pipeline of operations inplace to the dataset: {self.name}")
 
         for op in operations:
             op_name = op.name if isinstance(op, OperationState) else op
             msg.text(f"|_ {op_name}")
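The Printer in this hunk comes from the wasabi library; passing no_print=True silences every call, which is how the verbose flag is honoured. A small sketch outside the diff, assuming only that wasabi is installed:

    from wasabi import Printer

    msg = Printer(no_print=True)             # verbose == False: output suppressed
    msg.text("this line is not printed")

    msg = Printer(no_print=False)            # verbose run
    msg.text("|_ recon.v1.split_sentences")  # printed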
@@ -116,7 +116,7 @@ class Operation:
         """
         verbose = True
         msg = Printer(no_print=verbose == False)
 
         initial_state = kwargs.pop("initial_state") if "initial_state" in kwargs else None
         verbose = kwargs.pop("verbose") if "verbose" in kwargs else None
         if not initial_state:
@@ -150,7 +150,9 @@ class Operation:
                 dataset.example_store.add(new_example)
 
         new_data = []
-        for orig_example_hash, example, preprocessed_outputs in op_iter(dataset.data, self.pre, verbose=verbose):
+        for orig_example_hash, example, preprocessed_outputs in op_iter(
+            dataset.data, self.pre, verbose=verbose
+        ):
             if preprocessed_outputs:
                 res = self.op(example, *args, preprocessed_outputs=preprocessed_outputs, **kwargs)
             else: