format and add type def + docstring for split_sentences

This commit is contained in:
Kabir Khan 2020-06-17 09:22:41 -07:00
Parent 8e7367a526
Commit 655a8796d9
3 changed files with 40 additions and 24 deletions

View file

@@ -2,7 +2,8 @@
 import copy
 from collections import defaultdict
-from typing import DefaultDict, Dict, List
+from typing import Any, DefaultDict, Dict, List
 import spacy
 from spacy.tokens import Doc as SpacyDoc, Span as SpacySpan
@@ -118,19 +119,31 @@ spacy_pre = SpacyPreProcessor(nlp)
 @operation("recon.v1.split_sentences", pre=[spacy_pre])
-def split_sentences(example, preprocessed_outputs = {}):
+def split_sentences(example: Example, preprocessed_outputs: Dict[str, Any] = {}) -> List[Example]:
+    """Split a single example into multiple examples by splitting the text into
+    multiple sentences and resetting entity and token offsets based on offsets
+    relative to sentence boundaries
+
+    Args:
+        example (Example): Input Example
+        preprocessed_outputs (Dict[str, Any], optional): Outputs of preprocessors.
+
+    Returns:
+        List[Example]: List of split examples.
+            Could be list of 1 if the example is just one sentence.
+    """
     doc = preprocessed_outputs["recon.v1.spacy"]
     new_examples = []
     ents = []
     for ent in example.spans:
         span = doc.char_span(ent.start, ent.end, label=ent.label)
         if not span:
             token = None
-            text = doc.text[ent.start:ent.end]
+            text = doc.text[ent.start : ent.end]
             for t in doc:
                 if t.text == text:
                     token = t
             if token:
                 span = SpacySpan(doc, token.i, token.i + 1, label=ent.label)
         ents.append(span)
@@ -141,20 +154,21 @@ def split_sentences(example, preprocessed_outputs = {}):
         sent_doc = sent.as_doc()
         new_example = Example(
             text=sent_doc.text,
-            spans=[Span(
-                text=e.text,
-                start=e.start_char,
-                end=e.end_char,
-                token_start=e.start,
-                token_end=e.end,
-                label=e.label_
-            ) for e in sent_doc.ents],
-            tokens=[Token(
-                text=t.text,
-                start=t.idx,
-                end=t.idx + len(t.text),
-                id=i
-            ) for i, t in enumerate(sent_doc)]
+            spans=[
+                Span(
+                    text=e.text,
+                    start=e.start_char,
+                    end=e.end_char,
+                    token_start=e.start,
+                    token_end=e.end,
+                    label=e.label_,
+                )
+                for e in sent_doc.ents
+            ],
+            tokens=[
+                Token(text=t.text, start=t.idx, end=t.idx + len(t.text), id=i)
+                for i, t in enumerate(sent_doc)
+            ],
         )
         new_examples.append(new_example)
     return new_examples
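
For context, a minimal sketch of how the newly typed operation is normally invoked: through a Dataset pipeline rather than a direct call, so the registered "recon.v1.spacy" pre-processor supplies preprocessed_outputs automatically. The import paths, the data path, and the apply_ call follow recon's documented usage but are assumptions, not part of this diff:

from recon.dataset import Dataset
from recon.loaders import read_jsonl

# Load examples and wrap them in a Dataset (constructor matches the
# name/data parameters visible in dataset.py below); path is hypothetical.
ds = Dataset("train", read_jsonl("./data/train.jsonl"))

# Resolve the operation by its registered name; each multi-sentence
# example is replaced by one Example per sentence, with span and token
# offsets rebased to the sentence text.
ds.apply_("recon.v1.split_sentences")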

View file

@@ -66,7 +66,7 @@ class Dataset:
         data: List[Example] = [],
         operations: List[OperationState] = None,
         example_store: ExampleStore = None,
-        verbose: bool = False
+        verbose: bool = False,
     ):
         self.name = name
         self.data = data
@@ -151,10 +151,10 @@ class Dataset:
         Args:
             operations (List[Union[str, OperationState]]): List of operations
         """
         msg = Printer(no_print=self.verbose == False)
         msg.text(f"Applying pipeline of operations inplace to the dataset: {self.name}")
         for op in operations:
             op_name = op.name if isinstance(op, OperationState) else op
             msg.text(f"|_ {op_name}")

View file

@@ -116,7 +116,7 @@ class Operation:
         """
         verbose = True
         msg = Printer(no_print=verbose == False)
         initial_state = kwargs.pop("initial_state") if "initial_state" in kwargs else None
         verbose = kwargs.pop("verbose") if "verbose" in kwargs else None
         if not initial_state:
@@ -150,7 +150,9 @@ class Operation:
                 dataset.example_store.add(new_example)
         new_data = []
-        for orig_example_hash, example, preprocessed_outputs in op_iter(dataset.data, self.pre, verbose=verbose):
+        for orig_example_hash, example, preprocessed_outputs in op_iter(
+            dataset.data, self.pre, verbose=verbose
+        ):
             if preprocessed_outputs:
                 res = self.op(example, *args, preprocessed_outputs=preprocessed_outputs, **kwargs)
             else:
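
op_iter drives this loop: for each example it yields the original hash, the example, and any pre-processor outputs, which are then forwarded to the wrapped function as preprocessed_outputs. A hedged sketch of a custom operation hooking into the same mechanism; the import paths and the example.v1 name are assumptions for illustration, and spacy_pre mirrors the one defined in the first file above:

import spacy
from recon.operations import operation          # import path assumed
from recon.preprocess import SpacyPreProcessor  # import path assumed

nlp = spacy.load("en_core_web_sm")
spacy_pre = SpacyPreProcessor(nlp)  # mirrors spacy_pre in the hunk context above

@operation("example.v1.sentence_count", pre=[spacy_pre])
def sentence_count(example, preprocessed_outputs={}):
    # The pre-processor's output arrives keyed by its registered name,
    # exactly as split_sentences looks up "recon.v1.spacy" above.
    doc = preprocessed_outputs["recon.v1.spacy"]
    print(f"{len(list(doc.sents))} sentence(s) in: {example.text!r}")
    return example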