Format split_sentences and add type annotations and a docstring

This commit is contained in:
Kabir Khan 2020-06-17 09:22:41 -07:00
Родитель 8e7367a526
Коммит 655a8796d9
3 изменённых файлов: 40 добавлений и 24 удалений

Просмотреть файл

@ -2,7 +2,8 @@
import copy import copy
from collections import defaultdict from collections import defaultdict
from typing import DefaultDict, Dict, List from typing import Any, DefaultDict, Dict, List
import spacy import spacy
from spacy.tokens import Doc as SpacyDoc, Span as SpacySpan from spacy.tokens import Doc as SpacyDoc, Span as SpacySpan
@ -118,19 +119,31 @@ spacy_pre = SpacyPreProcessor(nlp)
@operation("recon.v1.split_sentences", pre=[spacy_pre]) @operation("recon.v1.split_sentences", pre=[spacy_pre])
def split_sentences(example, preprocessed_outputs = {}): def split_sentences(example: Example, preprocessed_outputs: Dict[str, Any] = {}) -> List[Example]:
"""Split a single example into multiple examples by splitting the text into
multiple sentences and resetting entity and token offsets based on offsets
relative to sentence boundaries
Args:
example (Example): Input Example
preprocessed_outputs (Dict[str, Any], optional): Outputs of preprocessors.
Returns:
List[Example]: List of split examples.
Could be list of 1 if the example is just one sentence.
"""
doc = preprocessed_outputs["recon.v1.spacy"] doc = preprocessed_outputs["recon.v1.spacy"]
new_examples = [] new_examples = []
ents = [] ents = []
for ent in example.spans: for ent in example.spans:
span = doc.char_span(ent.start, ent.end, label=ent.label) span = doc.char_span(ent.start, ent.end, label=ent.label)
if not span: if not span:
token = None token = None
text = doc.text[ent.start:ent.end] text = doc.text[ent.start : ent.end]
for t in doc: for t in doc:
if t.text == text: if t.text == text:
token = t token = t
if token: if token:
span = SpacySpan(doc, token.i, token.i + 1, label=ent.label) span = SpacySpan(doc, token.i, token.i + 1, label=ent.label)
ents.append(span) ents.append(span)
@ -141,20 +154,21 @@ def split_sentences(example, preprocessed_outputs = {}):
sent_doc = sent.as_doc() sent_doc = sent.as_doc()
new_example = Example( new_example = Example(
text=sent_doc.text, text=sent_doc.text,
spans=[Span( spans=[
text=e.text, Span(
start=e.start_char, text=e.text,
end=e.end_char, start=e.start_char,
token_start=e.start, end=e.end_char,
token_end=e.end, token_start=e.start,
label=e.label_ token_end=e.end,
) for e in sent_doc.ents], label=e.label_,
tokens=[Token( )
text=t.text, for e in sent_doc.ents
start=t.idx, ],
end=t.idx + len(t.text), tokens=[
id=i Token(text=t.text, start=t.idx, end=t.idx + len(t.text), id=i)
) for i, t in enumerate(sent_doc)] for i, t in enumerate(sent_doc)
],
) )
new_examples.append(new_example) new_examples.append(new_example)
return new_examples return new_examples

Просмотреть файл

@ -66,7 +66,7 @@ class Dataset:
data: List[Example] = [], data: List[Example] = [],
operations: List[OperationState] = None, operations: List[OperationState] = None,
example_store: ExampleStore = None, example_store: ExampleStore = None,
verbose: bool = False verbose: bool = False,
): ):
self.name = name self.name = name
self.data = data self.data = data
@ -151,10 +151,10 @@ class Dataset:
Args: Args:
operations (List[Union[str, OperationState]]): List of operations operations (List[Union[str, OperationState]]): List of operations
""" """
msg = Printer(no_print=self.verbose == False) msg = Printer(no_print=self.verbose == False)
msg.text(f"Applying pipeline of operations inplace to the dataset: {self.name}") msg.text(f"Applying pipeline of operations inplace to the dataset: {self.name}")
for op in operations: for op in operations:
op_name = op.name if isinstance(op, OperationState) else op op_name = op.name if isinstance(op, OperationState) else op
msg.text(f"|_ {op_name}") msg.text(f"|_ {op_name}")

Просмотреть файл

@ -116,7 +116,7 @@ class Operation:
""" """
verbose = True verbose = True
msg = Printer(no_print=verbose == False) msg = Printer(no_print=verbose == False)
initial_state = kwargs.pop("initial_state") if "initial_state" in kwargs else None initial_state = kwargs.pop("initial_state") if "initial_state" in kwargs else None
verbose = kwargs.pop("verbose") if "verbose" in kwargs else None verbose = kwargs.pop("verbose") if "verbose" in kwargs else None
if not initial_state: if not initial_state:
@ -150,7 +150,9 @@ class Operation:
dataset.example_store.add(new_example) dataset.example_store.add(new_example)
new_data = [] new_data = []
for orig_example_hash, example, preprocessed_outputs in op_iter(dataset.data, self.pre, verbose=verbose): for orig_example_hash, example, preprocessed_outputs in op_iter(
dataset.data, self.pre, verbose=verbose
):
if preprocessed_outputs: if preprocessed_outputs:
res = self.op(example, *args, preprocessed_outputs=preprocessed_outputs, **kwargs) res = self.op(example, *args, preprocessed_outputs=preprocessed_outputs, **kwargs)
else: else: