Mirror of https://github.com/microsoft/reconner.git

format and add type def + docstring for split_sentences

This commit is contained in:
Parent: 8e7367a526
Commit: 655a8796d9
@@ -2,7 +2,8 @@
 import copy
 from collections import defaultdict
-from typing import DefaultDict, Dict, List
+from typing import Any, DefaultDict, Dict, List

 import spacy
+from spacy.tokens import Doc as SpacyDoc, Span as SpacySpan
@@ -118,7 +119,19 @@ spacy_pre = SpacyPreProcessor(nlp)

 @operation("recon.v1.split_sentences", pre=[spacy_pre])
-def split_sentences(example, preprocessed_outputs = {}):
+def split_sentences(example: Example, preprocessed_outputs: Dict[str, Any] = {}) -> List[Example]:
+    """Split a single example into multiple examples by splitting the text into
+    multiple sentences and resetting entity and token offsets based on offsets
+    relative to sentence boundaries
+
+    Args:
+        example (Example): Input Example
+        preprocessed_outputs (Dict[str, Any], optional): Outputs of preprocessors.
+
+    Returns:
+        List[Example]: List of split examples.
+            Could be list of 1 if the example is just one sentence.
+    """
     doc = preprocessed_outputs["recon.v1.spacy"]

     new_examples = []
@@ -127,7 +140,7 @@ def split_sentences(example, preprocessed_outputs = {}):
         span = doc.char_span(ent.start, ent.end, label=ent.label)
         if not span:
             token = None
-            text = doc.text[ent.start:ent.end]
+            text = doc.text[ent.start : ent.end]
             for t in doc:
                 if t.text == text:
                     token = t
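The `if not span:` fallback in this hunk exists because spaCy's `Doc.char_span` returns `None` whenever the character offsets do not land exactly on token boundaries, so the code falls back to matching tokens by surface text. A minimal sketch of that behavior (the sample text is illustrative, not from this commit):

    import spacy

    nlp = spacy.blank("en")  # a blank pipeline is enough to tokenize
    doc = nlp("Recon helps you fix annotations.")

    # Offsets aligned with token boundaries yield a Span
    assert doc.char_span(0, 5).text == "Recon"

    # Offsets that cut into the middle of a token yield None --
    # exactly the case the fallback loop above handles
    assert doc.char_span(0, 3) is None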
@@ -141,20 +154,21 @@ def split_sentences(example, preprocessed_outputs = {}):
         sent_doc = sent.as_doc()
         new_example = Example(
             text=sent_doc.text,
-            spans=[Span(
-                text=e.text,
-                start=e.start_char,
-                end=e.end_char,
-                token_start=e.start,
-                token_end=e.end,
-                label=e.label_
-            ) for e in sent_doc.ents],
-            tokens=[Token(
-                text=t.text,
-                start=t.idx,
-                end=t.idx + len(t.text),
-                id=i
-            ) for i, t in enumerate(sent_doc)]
+            spans=[
+                Span(
+                    text=e.text,
+                    start=e.start_char,
+                    end=e.end_char,
+                    token_start=e.start,
+                    token_end=e.end,
+                    label=e.label_,
+                )
+                for e in sent_doc.ents
+            ],
+            tokens=[
+                Token(text=t.text, start=t.idx, end=t.idx + len(t.text), id=i)
+                for i, t in enumerate(sent_doc)
+            ],
         )
         new_examples.append(new_example)
     return new_examples
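With the new annotations, the operation's contract is explicit: one `Example` in, a list of per-sentence `Example`s out, with span offsets recomputed relative to each sentence. A hedged usage sketch; the `Dataset.apply_` call and the exact `Example`/`Span` constructor arguments are assumptions based on the surrounding library, not shown in this commit:

    from recon.dataset import Dataset
    from recon.types import Example, Span

    # Hypothetical two-sentence example; offsets are relative to the full text
    example = Example(
        text="Apple is in Cupertino. Microsoft is in Redmond.",
        spans=[
            Span(text="Apple", start=0, end=5, label="ORG"),
            Span(text="Microsoft", start=23, end=32, label="ORG"),
        ],
        tokens=[],
    )

    ds = Dataset("demo", [example])
    ds.apply_("recon.v1.split_sentences")  # spacy_pre runs first, then the op

    # Each sentence is now its own Example; "Microsoft" starts at offset 0
    # of the second sentence rather than offset 23 of the full text.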
@@ -66,7 +66,7 @@ class Dataset:
         data: List[Example] = [],
         operations: List[OperationState] = None,
         example_store: ExampleStore = None,
-        verbose: bool = False
+        verbose: bool = False,
     ):
         self.name = name
         self.data = data
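One thing the formatting pass leaves untouched: `data: List[Example] = []` here, like `preprocessed_outputs = {}` above, is a mutable default argument, which Python evaluates once at definition time and then shares across calls. A minimal illustration of the pitfall and the conventional fix (names are illustrative):

    from typing import List, Optional

    def add_item(item: int, items: List[int] = []) -> List[int]:
        items.append(item)  # mutates the single shared default list
        return items

    print(add_item(1))  # [1]
    print(add_item(2))  # [1, 2] -- state leaked from the previous call

    def add_item_safe(item: int, items: Optional[List[int]] = None) -> List[int]:
        if items is None:
            items = []  # fresh list on every call
        items.append(item)
        return items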
@@ -150,7 +150,9 @@ class Operation:
                 dataset.example_store.add(new_example)

         new_data = []
-        for orig_example_hash, example, preprocessed_outputs in op_iter(dataset.data, self.pre, verbose=verbose):
+        for orig_example_hash, example, preprocessed_outputs in op_iter(
+            dataset.data, self.pre, verbose=verbose
+        ):
             if preprocessed_outputs:
                 res = self.op(example, *args, preprocessed_outputs=preprocessed_outputs, **kwargs)
             else:
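For context on this branch: `op_iter` pairs each example with the outputs of the operation's registered preprocessors, keyed by preprocessor name, which is how `split_sentences` can read its parsed `Doc` from `preprocessed_outputs["recon.v1.spacy"]` without re-running spaCy. A rough sketch of that contract; this is simplified, and the `name` attribute on preprocessors is an assumption:

    from typing import Any, Dict, Iterator, List, Tuple

    def op_iter_sketch(
        data: List[Any], pre: List[Any], verbose: bool = False
    ) -> Iterator[Tuple[int, Any, Dict[str, Any]]]:
        """Yield (original example hash, example, preprocessor outputs)."""
        for example in data:
            # Run each preprocessor once, keyed by its registered name,
            # e.g. {"recon.v1.spacy": <spacy Doc>}
            outputs: Dict[str, Any] = {p.name: p(example) for p in pre}
            yield hash(example), example, outputs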