Merge pull request #19 from microsoft/laserprec/sphinx_doc

Reformat docstrings to Google style and add Sphinx documentation
This commit is contained in:
Jianjie Liu 2021-02-01 17:46:58 -05:00 committed by GitHub
Parent 5c4b634da9 f1bfe1f951
Commit 2942d34267
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
23 changed files with 835 additions and 482 deletions

View File

@@ -1,6 +1,6 @@
# Genalog - Synthetic Data Generator
[![Build Status](https://dev.azure.com/genalog-dev/genalog/_apis/build/status/Nightly-Build?branchName=main)](https://dev.azure.com/genalog-dev/genalog/_build/latest?definitionId=4&branchName=main) ![Azure DevOps tests (compact)](https://img.shields.io/azure-devops/tests/genalog-dev/genalog/4?compact_message) ![Azure DevOps coverage (main)](https://img.shields.io/azure-devops/coverage/genalog-dev/genalog/4/main) ![Python Versions](https://img.shields.io/badge/py-3.6%20%7C%203.7%20%7C%203.8%20-blue) ![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)
[![Build Status](https://dev.azure.com/genalog-dev/genalog/_apis/build/status/Nightly-Build?branchName=main)](https://dev.azure.com/genalog-dev/genalog/_build/latest?definitionId=4&branchName=main) ![Azure DevOps tests (compact)](https://img.shields.io/azure-devops/tests/genalog-dev/genalog/4?compact_message) ![Azure DevOps coverage (main)](https://img.shields.io/azure-devops/coverage/genalog-dev/genalog/4/main) ![Python Versions](https://img.shields.io/badge/py-3.6%20%7C%203.7%20%7C%203.8%20-blue) ![Supported OSs](https://img.shields.io/badge/platform-%20linux--64%20-red) ![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)
Genalog is an open source, cross-platform Python package for generating synthetic document images with text data. The tool also allows you to add various text degradations to these images. The purpose of this tool is to provide a fast and efficient way to generate synthetic documents from text data by leveraging layout from templates that you create in simple HTML format.

3
docs/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
_build/
_static/
_templates/

20
docs/Makefile Normal file
View File

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

67
docs/conf.py Normal file
View File

@@ -0,0 +1,67 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
import os
import sys
sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('../genalog'))
sys.path.insert(0, os.path.abspath('../genalog/degradation'))
# -- Project information -----------------------------------------------------
project = 'genalog'
copyright = '2021, Microsoft'
author = 'Microsoft'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.napoleon',
'sphinx.ext.coverage',
]
# The master toctree document.
master_doc = 'index'
autodoc_member_order = 'groupwise'
autoclass_content = 'both'
# Napoleon settings
napoleon_google_docstring = True
napoleon_numpy_docstring = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

View File

@@ -0,0 +1,29 @@
genalog.degradation package
===========================
Submodules
----------
genalog.degradation.degrader module
-----------------------------------
.. automodule:: genalog.degradation.degrader
:members:
:undoc-members:
:show-inheritance:
genalog.degradation.effect module
---------------------------------
.. automodule:: genalog.degradation.effect
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: genalog.degradation
:members:
:undoc-members:
:show-inheritance:

View File

@@ -0,0 +1,29 @@
genalog.generation package
==========================
Submodules
----------
genalog.generation.content module
---------------------------------
.. automodule:: genalog.generation.content
:members:
:undoc-members:
:show-inheritance:
genalog.generation.document module
----------------------------------
.. automodule:: genalog.generation.document
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: genalog.generation
:members:
:undoc-members:
:show-inheritance:

View File

@@ -0,0 +1,53 @@
genalog.ocr package
===================
Submodules
----------
genalog.ocr.blob\_client module
-------------------------------
.. automodule:: genalog.ocr.blob_client
:members:
:undoc-members:
:show-inheritance:
genalog.ocr.common module
-------------------------
.. automodule:: genalog.ocr.common
:members:
:undoc-members:
:show-inheritance:
genalog.ocr.grok module
-----------------------
.. automodule:: genalog.ocr.grok
:members:
:undoc-members:
:show-inheritance:
genalog.ocr.metrics module
--------------------------
.. automodule:: genalog.ocr.metrics
:members:
:undoc-members:
:show-inheritance:
genalog.ocr.rest\_client module
-------------------------------
.. automodule:: genalog.ocr.rest_client
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: genalog.ocr
:members:
:undoc-members:
:show-inheritance:

32
docs/genalog/genalog.rst Normal file
View File

@@ -0,0 +1,32 @@
genalog package
===============
Subpackages
-----------
.. toctree::
:maxdepth: 4
genalog.degradation
genalog.generation
genalog.ocr
genalog.text
Submodules
----------
genalog.pipeline module
-----------------------
.. automodule:: genalog.pipeline
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: genalog
:members:
:undoc-members:
:show-inheritance:

View File

@@ -0,0 +1,69 @@
genalog.text package
====================
Submodules
----------
genalog.text.alignment module
-----------------------------
.. automodule:: genalog.text.alignment
:members:
:undoc-members:
:show-inheritance:
genalog.text.anchor module
--------------------------
.. automodule:: genalog.text.anchor
:members:
:undoc-members:
:show-inheritance:
genalog.text.conll\_format module
---------------------------------
.. automodule:: genalog.text.conll_format
:members:
:undoc-members:
:show-inheritance:
genalog.text.lcs module
-----------------------
.. automodule:: genalog.text.lcs
:members:
:undoc-members:
:show-inheritance:
genalog.text.ner\_label module
------------------------------
.. automodule:: genalog.text.ner_label
:members:
:undoc-members:
:show-inheritance:
genalog.text.preprocess module
------------------------------
.. automodule:: genalog.text.preprocess
:members:
:undoc-members:
:show-inheritance:
genalog.text.splitter module
----------------------------
.. automodule:: genalog.text.splitter
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: genalog.text
:members:
:undoc-members:
:show-inheritance:

22
docs/index.rst Normal file
View File

@@ -0,0 +1,22 @@
.. genalog documentation master file, created by
sphinx-quickstart on Thu Jan 28 15:19:33 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to genalog's documentation!
===================================
.. toctree::
:maxdepth: 2
:caption: Contents:
genalog/genalog
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

35
docs/make.bat Normal file
View File

@@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

View File

@@ -0,0 +1,2 @@
sphinx
sphinx_rtd_theme

View File

@@ -16,29 +16,27 @@ class Degrader:
""" An object for applying multiple degradation effects onto an image"""
def __init__(self, effects):
"""Initialize a Degrader object
"""
Arguments:
effects {list} -- a list of 2-element tuple that defines:
effects (list) : a list of 2-element tuples (method_name, method_kwargs) where:
(method_name, method_kwargs)
:method_name: the name of the degradation method (method must be defined in 'genalog.degradation.effect')
:method_kwargs: the keyword arguments of the corresponding method
1. method_name: the name of the degradation method
(method must be defined in 'genalog.degradation.effect')
2. method_kwargs: the keyword arguments of the corresponding method
Example:
Example:
::
[
("blur", {"radius": 3}),
("bleed_through", {"alpha": 0.8),
("morphology", {"operation": "open", "kernel_shape": (3,3), "kernel_type": "ones"}),
("morphology", {"operation": "open", "kernel_shape": (3,3), "kernel_type": "ones"})
]
The example above will apply degradation effects to the images
in the following sequence:
The example above will apply degradation effects to the images
in the following sequence:
::
blur -> bleed_through -> morphological operation (open)
"blur" -> "bleed_through" -> "morphological operation (open)"
"""
Degrader.validate_effects(effects)
self.effects_to_apply = copy.deepcopy(effects)
@@ -49,15 +47,15 @@ class Degrader:
"""Validate the effects list
Arguments:
effects {list} -- a list of 2-element tuple that defines:
effects (list) : a list of 2-element tuples ``(method_name, method_kwargs)``
that defines:
(method_name, method_kwargs)
1. ``method_name`` : the name of the degradation method \
(method must be defined in ``genalog.degradation.effect``)
2. ``method_kwargs`` : the keyword arguments of the corresponding method
1. method_name: the name of the degradation method
(method must be defined in 'genalog.degradation.effect')
2. method_kwargs: the keyword arguments of the corresponding method
Example:
Example:
::
[
("blur", {"radius": "3"}),
@@ -66,10 +64,9 @@ class Degrader:
]
Raises:
ValueError: raise this error when
1. method_name not defined in "genalog.degradation.effect"
2. method_kwargs is not a valid keyword arguments in the
corresponding method
ValueError: raise this error when:
``method_name`` is not defined in "genalog.degradation.effect", or
``method_kwargs`` is not a valid keyword argument in the corresponding method
"""
for effect_tuple in effects:
method_name, method_kwargs = effect_tuple
@@ -109,7 +106,7 @@ class Degrader:
"""Apply degradation effects in sequence
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
src (numpy.ndarray) : source image of shape (rows, cols)
Returns:
a copy of the source image {numpy.ndarray} after applying the effects
@@ -132,7 +129,7 @@ class Degrader:
the keyword argument dictionary
Arguments:
kwargs {dict} -- keyword argument dictionary
kwargs (dict) : keyword argument dictionary
Ex: {"src": ImageState.ORIGINAL_STATE, "radius": 5}

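As a usage note for the class documented above, here is a minimal sketch of the Degrader contract. The effects list reuses the docstring's example; the applying method is assumed to be named apply_effects (its name is not visible in this diff), and the input page is a hypothetical blank grayscale array:

import numpy as np
from genalog.degradation.degrader import Degrader

degrader = Degrader([
    ("blur", {"radius": 3}),
    ("bleed_through", {"alpha": 0.8}),
    ("morphology", {"operation": "open", "kernel_shape": (3, 3), "kernel_type": "ones"}),
])
page = np.full((300, 200), 255, dtype=np.uint8)  # hypothetical white page of shape (rows, cols)
degraded = degrader.apply_effects(page)  # applies blur -> bleed_through -> morphology (open) in sequence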
View File

@@ -8,14 +8,12 @@ def blur(src, radius=5):
"""Wrapper function for cv2.GaussianBlur
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
Keyword Arguments:
radius {int} -- size of the square kernel,
MUST be an odd integer (default: {5})
src (numpy.ndarray) : source image of shape (rows, cols)
radius (int, optional) : size of the square kernel, MUST be an odd integer.
Defaults to 5.
Returns:
a copy of the source image {numpy.ndarray} after apply the effect
numpy.ndarray: a copy of the source image after applying the effect
"""
return cv2.GaussianBlur(src, (radius, radius), cv2.BORDER_DEFAULT)
@@ -26,16 +24,14 @@ def overlay_weighted(src, background, alpha, beta, gamma=0):
dst[i] = alpha*src[i] + beta*background[i] + gamma
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
background {numpy.ndarray} -- background image. Must be in same shape are `src`
alpha {float} -- transparent factor for the foreground
beta {float} -- transparent factor for the background
Keyword Arguments:
gamma {int} -- luminance constant (default: {0})
src (numpy.ndarray) : source image of shape (rows, cols)
background (numpy.ndarray) : background image. Must be the same shape as `src`
alpha (float) : transparent factor for the foreground
beta (float) : transparent factor for the background
gamma (int, optional) : luminance constant. Defaults to 0.
Returns:
a copy of the source image {numpy.ndarray} after apply the effect
numpy.ndarray: a copy of the source image after applying the effect
"""
return cv2.addWeighted(src, alpha, background, beta, gamma).astype(np.uint8)
@@ -46,11 +42,11 @@ def overlay(src, background):
dst[i] = src[i] & background[i]
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
background {numpy.ndarray} -- background image. Must be in same shape are `src`
src (numpy.ndarray) : source image of shape (rows, cols)
background (numpy.ndarray) : background image. Must be the same shape as `src`
Returns:
a copy of the source image {numpy.ndarray} after apply the effect
numpy.ndarray: a copy of the source image after applying the effect
"""
return cv2.bitwise_and(src, background).astype(np.uint8)
@@ -59,14 +55,14 @@ def translation(src, offset_x, offset_y):
"""Shift the image in x, y direction
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
offset_x {int} -- pixels in the x direction.
src (numpy.ndarray) : source image of shape (rows, cols)
offset_x (int) : pixels in the x direction.
Positive value shifts right and negative shifts left.
offset_y {int} -- pixels in the y direction.
offset_y (int) : pixels in the y direction.
Positive value shifts down and negative shifts up.
Returns:
a copy of the source image {numpy.ndarray} after apply the effect
numpy.ndarray: a copy of the source image after applying the effect
"""
rows, cols = src.shape
trans_matrix = np.float32([[1, 0, offset_x], [0, 1, offset_y]])
@@ -79,21 +75,18 @@ def bleed_through(src, background=None, alpha=0.8, gamma=0, offset_x=0, offset_y
"""Apply bleed through effect, background is flipped horizontally.
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
Keyword Arguments:
background {numpy.ndarray} -- background image. Must be in same
shape as foreground (default: {None})
alpha {float} -- transparent factor for the foreground (default: {0.8})
gamma {int} -- luminance constant (default: {0})
offset_x {int} -- background translation offset (default: {0})
Positive value shifts right and negative shifts right.
offset_y {int} -- background translation offset (default: {5})
Positive value shifts down and negative shifts up.
src (numpy.ndarray) : source image of shape (rows, cols)
background (numpy.ndarray, optional) : background image. Must be in same
shape as foreground. Defaults to None.
alpha (float, optional) : transparent factor for the foreground. Defaults to 0.8.
gamma (int, optional) : luminance constant. Defaults to 0.
offset_x (int, optional) : background translation offset. Defaults to 0.
Positive value shifts right and negative shifts left.
offset_y (int, optional) : background translation offset. Defaults to 5.
Positive value shifts down and negative shifts up.
Returns:
a copy of the source image {numpy.ndarray} after apply the effect.
Pixel value ranges [0, 255]
numpy.ndarray: a copy of the source image after applying the effect. Pixel values range in [0, 255]
"""
if background is None:
background = src.copy()
@@ -109,14 +102,12 @@ def pepper(src, amount=0.05):
See https://scikit-image.org/docs/stable/api/skimage.util.html#random-noise
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
Keyword Arguments:
amount {float} -- proportion of pixels in range [0, 1] to apply the effect
(default: {0.05})
src (numpy.ndarray) : source image of shape (rows, cols)
amount (float, optional) : proportion of pixels in range [0, 1] to apply the effect.
Defaults to 0.05.
Returns:
a copy of the source image {numpy.ndarray} after apply the effect.
numpy.ndarray: a copy of the source image after applying the effect.
Pixel values range in [0, 255] as uint8.
"""
dst = src.copy()
@@ -132,14 +123,12 @@ def salt(src, amount=0.3):
See https://scikit-image.org/docs/stable/api/skimage.util.html#random-noise
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
Keyword Arguments:
amount {float} -- proportion of pixels in range [0, 1] to apply the effect
(default: {0.05})
src (numpy.ndarray) : source image of shape (rows, cols)
amount (float, optional) : proportion of pixels in range [0, 1] to apply the effect.
Defaults to 0.3.
Returns:
a copy of the source image {numpy.ndarray} after apply the effect.
numpy.ndarray: a copy of the source image after applying the effect.
Pixel values range in [0, 255]
"""
dst = src.copy()
@@ -153,16 +142,16 @@ def salt_then_pepper(src, salt_amount=0.1, pepper_amount=0.05):
"""Randomly add salt then add pepper onto the image.
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
salt_amount {float} -- proportion of pixels in range [0, 1] to
apply the salt effect
(default: {0.1})
pepper_amount {float} -- proportion of pixels in range [0, 1] to
apply the pepper effect
(default: {0.05})
src (numpy.ndarray) : source image of shape (rows, cols)
salt_amount (float) : proportion of pixels in range [0, 1] to
apply the salt effect.
Defaults to 0.1.
pepper_amount (float) : proportion of pixels in range [0, 1] to
apply the pepper effect.
Defaults to 0.05.
Returns:
a copy of the source image {numpy.ndarray} after apply the effect.
numpy.ndarray: a copy of the source image after applying the effect.
Pixel values range in [0, 255] as uint8.
"""
salted = salt(src, amount=salt_amount)
@@ -173,16 +162,16 @@ def pepper_then_salt(src, pepper_amount=0.05, salt_amount=0.1):
"""Randomly add pepper then salt onto the image.
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
pepper_amount {float} -- proportion of pixels in range [0, 1] to
apply the pepper effect.
(default: {0.05})
salt_amount {float} -- proportion of pixels in range [0, 1] to
apply the salt effect.
(default: {0.1})
src (numpy.ndarray) : source image of shape (rows, cols)
pepper_amount (float) : proportion of pixels in range [0, 1] to
apply the pepper effect.
Defaults to 0.05.
salt_amount (float) : proportion of pixels in range [0, 1] to
apply the salt effect.
Defaults to 0.1.
Returns:
a copy of the source image {numpy.ndarray} after apply the effect.
numpy.ndarray: a copy of the source image after applying the effect.
Pixel values range in [0, 255] as uint8.
"""
peppered = pepper(src, amount=pepper_amount)
@@ -193,44 +182,45 @@ def create_2D_kernel(kernel_shape, kernel_type="ones"):
"""Create 2D kernel for morphological operations.
Arguments:
kernel_shape {tuple} -- shape of the kernel (rows, cols)
kernel_shape (tuple) : shape of the kernel (rows, cols)
kernel_type (str, optional) : type of kernel. Defaults to "ones".
::
Keyword Arguments:
kernel_type {str} -- type of kernel (default: {"ones"}).
All supported kernel types are below:
All supported kernel types are below:
"ones": kernel is filled with all 1s in shape (rows, cols)
[[1,1,1],
[1,1,1],
[1,1,1]]
"upper_triangle": upper triangular matrix filled with ones
[[1,1,1],
[0,1,1],
[0,0,1]]
"lower_triangle": lower triangular matrix filled with ones
[[1,0,0],
[1,1,0],
[1,1,1]]
"x": "X" shape cross
[[1,0,1],
[0,1,0],
[1,0,1]]
"plus": "+" shape cross
[[0,1,0],
[1,1,1],
[0,1,0]]
"ellipse": elliptical kernel
[[0, 0, 1, 0, 0],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[0, 0, 1, 0, 0]]
"ones": kernel is filled with all 1s in shape (rows, cols)
[[1,1,1],
[1,1,1],
[1,1,1]]
"upper_triangle": upper triangular matrix filled with ones
[[1,1,1],
[0,1,1],
[0,0,1]]
"lower_triangle": lower triangular matrix filled with ones
[[1,0,0],
[1,1,0],
[1,1,1]]
"x": "X" shape cross
[[1,0,1],
[0,1,0],
[1,0,1]]
"plus": "+" shape cross
[[0,1,0],
[1,1,1],
[0,1,0]]
"ellipse": elliptical kernel
[[0, 0, 1, 0, 0],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[0, 0, 1, 0, 0]]
Raises:
ValueError: if kernel is not a 2-element tuple or
kernel_type is not one of the supported values
Returns:
a 2D array {numpy.ndarray} of shape `kernel_shape`.
numpy.ndarray: a 2D array of shape ``kernel_shape``.
"""
if len(kernel_shape) != 2:
raise ValueError("Kernel shape must be a tuple of 2 integers")
@@ -274,21 +264,18 @@ def morphology(src, operation="open", kernel_shape=(3, 3), kernel_type="ones"):
("open", "close", "dilate" and "erode") with the given parameters
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
Keyword Arguments:
operation {str} -- name of a morphological operation:
("open", "close", "dilate", "erode")
(default: {"open"})
kernel_shape {tuple} -- shape of the kernel (rows, cols)
(default: {(3,3)})
kernel_type {str} -- type of kernel (default: {"ones"})
Supported kernel_types are:
["ones", "upper_triangle", "lower_triangle",
"x", "plus", "ellipse"]
src (numpy.ndarray) : source image of shape (rows, cols)
operation (str, optional) : name of a morphological operation:
``("open", "close", "dilate", "erode")``
Defaults to ``"open"``.
kernel_shape (tuple, optional) : shape of the kernel (rows, cols).
Defaults to (3,3).
kernel_type (str, optional) : type of kernel.
``("ones", "upper_triangle", "lower_triangle", "x", "plus", "ellipse")``
Defaults to ``"ones"``.
Returns:
a copy of the source image {numpy.ndarray} after apply the effect.
numpy.ndarray: a copy of the source image after applying the effect.
"""
kernel = create_2D_kernel(kernel_shape, kernel_type)
if operation == "open":
@@ -311,15 +298,16 @@ def open(src, kernel):
foreground pixels (white pixels); however, it is less destructive than erosion.
For more information see:
1. https://docs.opencv.org/master/d9/d61/tutorial_py_morphological_ops.html
2. http://homepages.inf.ed.ac.uk/rbf/HIPR2/open.htm
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
kernel {numpy.ndarray} -- a 2D array for structuring the morphological effect
src (numpy.ndarray) : source image of shape (rows, cols)
kernel (numpy.ndarray) : a 2D array for structuring the morphological effect
Returns:
a copy of the source image {numpy.ndarray} after apply the effect
numpy.ndarray: a copy of the source image after applying the effect.
"""
return cv2.morphologyEx(src, cv2.MORPH_OPEN, kernel)
@@ -330,46 +318,51 @@ def close(src, kernel):
dilation of the original boundary shape.
For more information see:
1. https://docs.opencv.org/master/d9/d61/tutorial_py_morphological_ops.html
2. http://homepages.inf.ed.ac.uk/rbf/HIPR2/close.htm
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
kernel {numpy.ndarray} -- a 2D array for structuring the morphological effect
src (numpy.ndarray) : source image of shape (rows, cols)
kernel (numpy.ndarray) : a 2D array for structuring the morphological effect
Returns:
a copy of the source image {numpy.ndarray} after apply the effect
numpy.ndarray: a copy of the source image after applying the effect.
"""
return cv2.morphologyEx(src, cv2.MORPH_CLOSE, kernel)
def erode(src, kernel):
""" "erode" morphological operation. Erodes foreground pixels (white pixels).
For more information see:
1. https://docs.opencv.org/master/d9/d61/tutorial_py_morphological_ops.html
2. http://homepages.inf.ed.ac.uk/rbf/HIPR2/erode.htm
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
kernel {numpy.ndarray} -- a 2D array for structuring the morphological effect
src (numpy.ndarray) : source image of shape (rows, cols)
kernel (numpy.ndarray) : a 2D array for structuring the morphological effect
Returns:
a copy of the source image {numpy.ndarray} after apply the effect
numpy.ndarray: a copy of the source image after applying the effect.
"""
return cv2.erode(src, kernel)
def dilate(src, kernel):
""" "dilate" morphological operation. Grows foreground pixels (white pixels).
For more information see:
1. https://docs.opencv.org/master/d9/d61/tutorial_py_morphological_ops.html
2. http://homepages.inf.ed.ac.uk/rbf/HIPR2/dilate.htm
Arguments:
src {numpy.ndarray} -- source image of shape (rows, cols)
kernel {numpy.ndarray} -- a 2D array for structuring the morphological effect
src (numpy.ndarray) : source image of shape (rows, cols)
kernel (numpy.ndarray) : a 2D array for structuring the morphological effect
Returns:
a copy of the source image {numpy.ndarray} after apply the effect
numpy.ndarray: a copy of the source image after applying the effect.
"""
return cv2.dilate(src, kernel)

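The effect functions above can also be called directly. A short sketch under the documented signatures (the input array is hypothetical; blur's radius must be an odd integer and the amounts are proportions in [0, 1]):

import numpy as np
from genalog.degradation import effect

page = np.full((100, 100), 255, dtype=np.uint8)   # hypothetical white page
blurred = effect.blur(page, radius=5)             # Gaussian blur with a 5x5 kernel
salted = effect.salt(blurred, amount=0.3)         # apply salt noise to ~30% of pixels
opened = effect.morphology(salted, operation="open", kernel_shape=(3, 3), kernel_type="ones")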
View File

@@ -33,27 +33,30 @@ class Document(object):
"""Initialize a Document object with source template and content
Arguments:
content {CompositeContent} -- a iterable object whose elements
template {Template} -- a jinja2.Template object
content (CompositeContent) : an iterable object whose elements
template (Template) : a jinja2.Template object
Optional Argument:
styles [dict] -- a kwargs dictionary (context) whose keys and values are
the template variable and their respective values
Other Parameters:
styles (dict) : a kwargs dictionary (context) whose keys and values are
the template variable and their respective values
Example:
{
"font_family": "Calibri",
"font_size": "10px",
"hyphenate": True,
}
Example:
::
Note that this assumes that "font_family", "font_size", "hyphenate" are valid
variables declared in the loaded template. There will be **NO SIDE-EFFECT**
providing an variable undefined in the template.
{
"font_family": "Calibri",
"font_size": "10px",
"hyphenate": True,
}
You can also provide these key-value pairs via Python keyword arguments:
**NOTE** that this assumes that "font_family", "font_size", "hyphenate" are valid
variables declared in the loaded template. There will be **NO SIDE-EFFECT**
from providing a variable that is undefined in the template.
Document(content, template, font_family="Calibri, font_size="10px", hyphenate=True)
You can also provide these key-value pairs via Python keyword arguments:
::
Document(content, template, font_family="Calibri, font_size="10px", hyphenate=True)
"""
self.content = content
self.template = template
@@ -72,7 +75,7 @@ class Document(object):
This method will be used mostly for testing purpose.
Returns:
[str] -- compiled Html template in unicode string
str : compiled Html template in unicode string
"""
return self.template.render(content=self.content, **self.styles)
@@ -81,10 +84,10 @@ class Document(object):
Arguments:
target -- a filename, file-like object, or None
split_pages {bool} -- true if saving each document page as a separate file.
zoom {int} -- the zoom factor in PDF units per CSS units.
split_pages (bool) : true if saving each document page as a separate file.
zoom (int) : the zoom factor in PDF units per CSS units.
split_pages {bool} -- true if save each document page as a separate file.
split_pages (bool) : true if saving each document page as a separate file.
Returns:
The PDF as bytes if target is not provided or None, otherwise None (the PDF is written to target)
@@ -96,8 +99,8 @@ class Document(object):
Arguments:
target -- a filename, file-like object, or None
split_pages {bool} -- true if save each document page as a separate file.
resolution {int} -- the output resolution in PNG pixels per CSS inch. At 300 dpi (the default),
split_pages (bool) : true if saving each document page as a separate file.
resolution (int) : the output resolution in PNG pixels per CSS inch. At 300 dpi (the default),
PNG pixels match the CSS px unit.
Returns:
@@ -124,15 +127,16 @@ class Document(object):
def render_array(self, resolution=300, channel="GRAYSCALE"):
"""Render document as a numpy.ndarray.
Keyword Arguments:
resolution {int} -- in units dpi (default: {300})
channel {str} -- abbreviation for color channels (default: {"GRAYSCALE"})
available values are: "GRAYSCALE", "RGB", "RGBA", "BGRA", "BGR"
Arguments:
resolution (int, optional) : in units dpi. Defaults to 300.
channel (str, optional): abbreviation for color channels. Available
values are: ``"GRAYSCALE", "RGB", "RGBA", "BGRA", "BGR"``
Defaults to ``"GRAYSCALE"``.
Note that "RGB" is 3-channel, "RGBA" is 4-channel and "GRAYSCALE" is single channel
**NOTE**: ``"RGB"`` is 3-channel, ``"RGBA"`` is 4-channel and ``"GRAYSCALE"`` is single channel
Returns:
A numpy.ndarray representation of the document.
numpy.ndarray: representation of the document.
"""
# Method below returns a cairocffi.ImageSurface object
# https://cairocffi.readthedocs.io/en/latest/api.html#cairocffi.ImageSurface
@@ -172,16 +176,18 @@ class Document(object):
def update_style(self, **style):
"""Update template variables that controls the document style and re-compile the document to reflect the style change.
Optional Arguments:
style {dict} -- a kwargs dictionary whose keys and values are
the template variable and their respective values
Other Parameters:
style (dict) : a kwargs dictionary whose keys and values are
the template variable and their respective values
Example:
{
"font_family": "Calibri",
"font_size": "10px",
"hyphenate": True
}
Example:
::
{
"font_family": "Calibri",
"font_size": "10px",
"hyphenate": True
}
"""
self.styles.update(style)
@@ -198,10 +204,11 @@ class DocumentGenerator:
def __init__(self, template_path=None):
"""Initialize a DocumentGenerator class
Keyword Arguments:
template_path {str} -- filepath of custom templates (default: {None})
*** Important *** if not set, will use the default templates from the
package "genalog.generation.templates".
Arguments:
template_path (str, optional) : filepath of custom templates. Defaults to None.
**NOTE**: if not set, will use the default templates from the
package "genalog.generation.templates".
"""
if template_path:
self.template_env = Environment(
@@ -228,10 +235,10 @@ class DocumentGenerator:
This function filters out non-html templates and base templates
Arguments:
template_name {str} -- target of the template
template_name (str) : target of the template
Returns:
[bool] -- True if keeping the template in the list. False otherwise.
bool : True if keeping the template in the list. False otherwise.
"""
TEMPLATES_TO_REMOVE = [".css", "base.html.jinja", "macro"]
if any(name in template_name for name in TEMPLATES_TO_REMOVE):
@@ -243,30 +250,32 @@ class DocumentGenerator:
Set new styles to generate.
Arguments:
style_combination {dict} -- a dictionary {str: list} enlisting the combinations
of values to generate per style property
(default: {None})
style_combination (dict) : a dictionary {str: list} enlisting the combinations
of values to generate per style property. Defaults to None.
Example:
{
"font_family": ["Calibri", "Times"],
"font_size": ["10px", "12px"],
"hyphenate": [True],
}
Example:
::
will produce documents with the following combinations of styles:
{
"font_family": ["Calibri", "Times"],
"font_size": ["10px", "12px"],
"hyphenate": [True],
}
("Calibri", "10px", True)
("Times" , "10px", True)
("Calibri", "12px", True)
("Times" , "12px", True)
will produce documents with the following combinations of styles:
::
Note that this assumes that "font_family", "font_size", "hyphenate" are valid
variables declared in the loaded template. There will be NO side-effect providing
an variable UNDEFINED in the template.
("Calibri", "10px", True)
("Times" , "10px", True)
("Calibri", "12px", True)
("Times" , "12px", True)
If this parameter is not provided, generator will use default document
styles: DEFAULT_STYLE_COMBINATION
**NOTE** that this assumes that ``font_family``, ``font_size``, ``hyphenate`` are valid
variables declared in the loaded template. There will be NO side-effect from providing
a variable UNDEFINED in the template.
If this parameter is not provided, the generator will use the default document
styles: ``DEFAULT_STYLE_COMBINATION``.
"""
self.styles_to_generate = DocumentGenerator.expand_style_combinations(
style_combinations
@@ -276,12 +285,12 @@ class DocumentGenerator:
"""Create a Document generator
Arguments:
content {list} -- a list [str] of string to populate the template
templates_to_render {list} -- a list [str] or templates to render
content (list) : a list [str] of strings to populate the template
templates_to_render (list) : a list [str] of templates to render
These templates must be located in the self.template_env
Yields:
[Document] -- a Document Object
Document : a Document Object
"""
for template_name in templates_to_render:
if template_name not in self.template_list:
@@ -296,34 +305,39 @@ class DocumentGenerator:
def expand_style_combinations(styles):
"""Expand the list of style values into all possible style combinations
Example:
styles =
{
"font_family": ["Calibri", "Times"],
"font_size": ["10px", "12px"],
"hyphenate": [True],
}
this method will return:
[
{"font_family": "Calibri", "font_size": "10px", "hyphenate":True }
{"font_family": "Times", "font_size": "10px", "hyphenate":True }
{"font_family": "Calibri", "font_size": "12px", "hyphenate":True }
{"font_family": "Times", "font_size": "12px", "hyphenate":True }
]
The result dictionaries are intended to be used as a kwargs to initialize a
Document Object:
Example:
Document(template, content, **{"font_family": "Calibri", "font_size": ...})
Arguments:
styles {dict} -- a dictionary {str: list} enlisting the combinations of values
to generate per style property
styles (dict) : a dictionary {str: list} enlisting the combinations of values
to generate per style property
Return:
list -- a list of dictionaries
list : a list of dictionaries
Example:
::
styles =
{
"font_family": ["Calibri", "Times"],
"font_size": ["10px", "12px"],
"hyphenate": [True],
}
This method will return:
::
[
{"font_family": "Calibri", "font_size": "10px", "hyphenate":True }
{"font_family": "Times", "font_size": "10px", "hyphenate":True }
{"font_family": "Calibri", "font_size": "12px", "hyphenate":True }
{"font_family": "Times", "font_size": "12px", "hyphenate":True }
]
The result dictionaries are intended to be used as kwargs to initialize a
``Document`` object:
::
Document(template, content, **{"font_family": "Calibri", "font_size": ...})
"""
# return empty list if input is empty
if not styles:

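Putting the DocumentGenerator and Document contracts above together, a hedged sketch; create_generator and the template name text_block.html.jinja are taken from this diff, while the content list is hypothetical:

from genalog.generation.document import DocumentGenerator

doc_gen = DocumentGenerator()  # no template_path, so the packaged default templates are used
content = ["A sample paragraph to fill the template."]  # hypothetical content
for doc in doc_gen.create_generator(content, ["text_block.html.jinja"]):
    img = doc.render_array(resolution=300, channel="GRAYSCALE")  # one grayscale page per style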
View File

@@ -6,13 +6,13 @@ OCR Metrics
Accuracy = Correct Words/Total Words (in target strings)
2. Count of edit distance ops:
insert, delete, substitutions; like in the paper "Deep Statistical Analysis of OCR Errors for Effective Post-OCR Processing".
This is based on Levenshtein edit distance.
insert, delete, substitutions; like in the paper "Deep Statistical Analysis of OCR Errors for Effective Post-OCR Processing".
This is based on Levenshtein edit distance.
3. By looking at the gaps in alignment we also generate substitution dicts:
e.g.: if we have text "a worn coat" and ocr is "a wom coat", "rn" -> "m" will be captured as a substitution
since the rest of the segments align. The assumption here is that we do not expect to have very long gaps in alignment,
hence collecting and counting these substitutions will be manageable.
hence collecting and counting these substitutions will be manageable.
"""
import argparse

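As an illustration of the word-accuracy metric described above (Correct Words / Total Words in the target string), here is a small helper; it is illustrative only and not part of genalog.ocr.metrics:

def word_accuracy(target_text, ocr_text):
    # Correct Words / Total Words, counted over the target tokens
    target_tokens = target_text.split()
    ocr_tokens = set(ocr_text.split())
    correct = sum(1 for token in target_tokens if token in ocr_tokens)
    return correct / len(target_tokens)

print(word_accuracy("a worn coat", "a wom coat"))  # 2/3, since "worn" was misread as "wom"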
View File

@@ -40,25 +40,23 @@ class AnalogDocumentGeneration(object):
return self.doc_generator.template_list
# Fix: rename to generate_sample()
# Add another method called generate_all_styles()
# TODO: add another method called generate_all_styles()
def generate_img(self, full_text_path, template, target_folder=None):
"""Generate a image with a sample style given a text document
NOTE: This does not generate all possible style combinations.
Use generate_all_styles() instead.
**NOTE**: This does not generate all possible style combinations.
Arguments:
full_text_path {str} -- full filepath of a text document (i.e /dataset/doc.txt)
template {str} -- name of html template to generate document from
Ex: "text_block.html.jinja"
Keyword Arguments:
target_folder {str} -- folder path in which the generated images are stored
(default: {None})
resolution {int} -- resolution in dpi (default: {300})
full_text_path (str) : full filepath of a text document (ex: "/dataset/doc.txt").
template (str) : name of html template to generate document from. (ex: "text_block.html.jinja")
target_folder (str, optional) : folder path in which the generated images are stored. Defaults to None.
resolution (int, optional) : resolution in dpi. Defaults to 300.
Raises:
RuntimeError: when unable to write to disk at the specified path
Returns:
numpy.ndarray: synthetic image
"""
with open(full_text_path, "r", encoding="utf8") as f: # read file
text = f.read()

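A sketch of the generate_img contract documented above; the constructor arguments of AnalogDocumentGeneration are not shown in this diff, so default construction is assumed, and the input path is hypothetical:

from genalog.pipeline import AnalogDocumentGeneration

doc_generation = AnalogDocumentGeneration()  # assumed default construction
# Returns the synthetic page as a numpy.ndarray; with target_folder=None nothing is written to disk
img = doc_generation.generate_img("/dataset/doc.txt", "text_block.html.jinja", target_folder=None)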
View File

@@ -35,17 +35,15 @@ def _align_seg(
calls the sequence alignment algorithm (Needleman-Wunsch)
Arguments:
gt {str} -- a ground truth string
noise {str} -- a string with ocr noise
Keyword Arguments:
match_reward {int} -- reward for matching characters (default: {MATCH_REWARD})
mismatch_pen {int} -- penalty for mistmatching characters (default: {MISMATCH_PENALTY})
gap_pen {int} -- penalty for creating a gap (default: {GAP_PENALTY})
gap_ext_pen {int} -- penalty for extending a gap (default: {GAP_EXT_PENALTY})
gt (str) : a ground truth string
noise (str) : a string with ocr noise
match_reward (int, optional) : reward for matching characters. Defaults to ``MATCH_REWARD``.
mismatch_pen (int, optional) : penalty for mismatching characters. Defaults to ``MISMATCH_PENALTY``.
gap_pen (int, optional) : penalty for creating a gap. Defaults to ``GAP_PENALTY``.
gap_ext_pen (int, optional) : penalty for extending a gap. Defaults to ``GAP_EXT_PENALTY``.
Returns:
list -- a list of alignment tuples. Each alignment tuple
list : a list of alignment tuples. Each alignment tuple
is one possible alignment candidate.
A tuple (str, str, int, int, int) contains the following information:
@@ -111,9 +109,9 @@ def _select_alignment_candidates(alignments, target_num_gt_tokens):
This method is to search for such candidate that satisfy the invariant.
Arguments:
alignments {list} -- a list of alignment tuples as follows:
alignments (list) : a list of alignment tuples as follows:
[(str1, str2, alignment_score, alignment_start, alignment_end), (str1, str2, ...), ...]
target_num_gt_tokens {int} -- the number of token in the aligned ground truth string should have
target_num_gt_tokens (int) : the number of tokens the aligned ground truth string should have
Raises:
ValueError: raises this error if
@@ -146,25 +144,26 @@
def align(gt, noise, gap_char=GAP_CHAR):
"""Align two text segments via sequence alignment algorithm
NOTE: this algorithm is O(N^2) and is NOT efficient for longer text.
**NOTE**: this algorithm is O(N^2) and is NOT efficient for longer text.
Please refer to `genalog.text.anchor` for faster alignment on longer strings.
Arguments:
gt {str} -- ground true text (should not contain GAP_CHAR)
noise {str} -- str with ocr noise (should not contain GAP_CHAR)
Keyword Arguments:
gap_char {char} -- gap char used in alignment algorithm (default: {GAP_CHAR})
gt (str) : ground truth text (should not contain GAP_CHAR)
noise (str) : str with ocr noise (should not contain GAP_CHAR)
gap_char (char, optional) : gap char used in alignment algorithm. Defaults to GAP_CHAR.
Returns:
a tuple (str, str) of aligned ground truth and noise:
(aligned_gt, aligned_noise)
tuple(str, str) : a tuple of aligned ground truth and noise
Invariants:
The returned aligned strings will satisfy the following invariants:
1. len(aligned_gt) == len(aligned_noise)
2. number of tokens in gt == number of tokens in aligned_gt
For example:
1. ``len(aligned_gt) == len(aligned_noise)``
2. ``number of tokens in gt == number of tokens in aligned_gt``
Example:
::
gt: "New York is big" (num_tokens = 4)
aligned_gt: "N@ew @@York @is big@@" (num_tokens = 4)
@@ -193,8 +192,8 @@ def _format_alignment(align1, align2):
"""Wrapper function for Bio.pairwise2.format_alignment()
Arguments:
align1 {str} -- alignment str
align2 {str} -- second str for alignment
align1 (str) : alignment str
align2 (str) : second str for alignment
Returns:
a string with formatted alignment.
@@ -221,8 +220,8 @@ def _find_token_start(s, index):
"""Find the position of the start of token
Arguments:
s {str} -- string to search in
index {int} -- index to begin search from
s (str) : string to search in
index (int) : index to begin search from
Returns:
- position {int} of the first non-whitespace character
@@ -250,8 +249,8 @@ def _find_token_end(s, index):
So, for single character string (eg. "c"), it will return 0.
Arguments:
s {str} -- string to search in
index {int} -- index to begin search from
s (str) : string to search in
index (int) : index to begin search from
Returns:
- position {int} of the first non-whitespace character
@@ -279,13 +278,14 @@ def _find_next_token(s, start):
So, for single character string (eg. "c"), it will return (0,0)
Arguments:
s {str} -- the string to search token in
start {int} -- the starting index to start search in
s (str) : the string to search token in
start (int) : the starting index to start search in
Returns:
a tuple of (int, int) responding to the start and end indices of
a token in the given s.
"""
token_start = _find_token_start(s, start)
token_end = _find_token_end(s, token_start)
return token_start, token_end
@@ -301,13 +301,11 @@ def _is_valid_token(token, gap_char=GAP_CHAR):
**Important**: this method expects one token, not multiple space-separated tokens
Arguments:
token {str} -- input string token
Keyword Arguments:
gap_char {char} -- gap char used in alignment algorithm (default: {GAP_CHAR})
token (str) : input string token
gap_char (char, optional) : gap char used in alignment algorithm. Defaults to GAP_CHAR.
Returns:
bool-- True if is a valid token, false otherwise
bool : True if it is a valid token, False otherwise
"""
# Matches multiples of 'gap_char' that are padded with whitespace characters on either end
INVALID_TOKEN_REGEX = (
@@ -317,42 +315,41 @@
def parse_alignment(aligned_gt, aligned_noise, gap_char=GAP_CHAR):
"""Parse alignment to pair ground truth tokens with noise tokens
r"""Parse alignment to pair ground truth tokens with noise tokens
::
Case 1: Case 2: Case 3: Case 4: Case 5:
one-to-many many-to-one many-to-many missing tokens one-to-one
(Case 1&2 Composite)
gt "New York" "New York" "New York" "New York" "New York"
| | | | | | | | | |
aligned_gt "New Yo@rk" "New York" "N@ew York" "New York" "New York"
| /\\ \\/ /\\/ | | | |
aligned_noise "New Yo rk" "New@York" "N ew@York" "New @@@@" "New York"
| /\ \/ /\/ | | | |
aligned_noise "New Yo rk" "New@York" "N ew@York" "New @@@@" "New York"
| | | | | | | | |
noise "New Yo rk" "NewYork" "N ewYork" "New" "New York"
Arguments:
aligned_gt {str} -- ground truth string aligned with the nose string
aligned_noise {str} -- noise string aligned with the ground truth
Keyword Arguments:
gap_char {char} -- gap char used in alignment algorithm (default: {GAP_CHAR})
aligned_gt (str) : ground truth string aligned with the noise string
aligned_noise (str) : noise string aligned with the ground truth
gap_char (char, optional) : gap char used in alignment algorithm. Defaults to GAP_CHAR.
Returns:
a tuple (list, list) of two 2D int arrays as follows:
tuple -- a tuple ``(gt_to_noise_mapping, noise_to_gt_mapping)`` of two 2D int arrays:
(gt_to_noise_mapping, noise_to_gt_mapping)
where each array defines the mapping between aligned gt tokens
to noise tokens and vice versa.
where each array defines the mapping from aligned gt tokens
to noise tokens and vice versa.
Example:
Given input
::
For example:
Given input
aligned_gt: "N@ew York @is big"
aligned_gt: "N@ew York @is big"
/\\ | | |
aligned_noise: "N ew@York kis big."
The returned output will be:
The returned output will be:
::
([[0,1],[1],[2],[3]], [[0],[0,1],[2],[3]])
"""
# Pseudo-algorithm:

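A short sketch of align() under the contract above, reusing the docstring's "New York" example; the default gap character is assumed to be the "@" shown in the examples:

from genalog.text import alignment

aligned_gt, aligned_noise = alignment.align("New York is big", "New Yo rk is big")
assert len(aligned_gt) == len(aligned_noise)  # invariant 1 from the docstring
# invariant 2: aligned_gt has the same number of tokens as the input gt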
View File

@@ -1,16 +1,16 @@
"""
Baseline alignment algorithm is slow on long documents.
The idea is to break down the longer text into smaller fragments
for quicker alignment on individual pieces. We refer "anchor words"
as these points of breakage.
Baseline alignment algorithm is slow on long documents.
The idea is to break down the longer text into smaller fragments
for quicker alignment on individual pieces. We refer to these
points of breakage as "anchor words".
The bulk of this algorithm is to identify these "anchor words".
The bulk of this algorithm is to identify these "anchor words".
This is an re-implementation of the algorithm in this paper
"A Fast Alignment Scheme for Automatic OCR Evaluation of Books"
(https://ieeexplore.ieee.org/document/6065412)
This is a re-implementation of the algorithm in this paper
"A Fast Alignment Scheme for Automatic OCR Evaluation of Books"
(https://ieeexplore.ieee.org/document/6065412)
We rely on `genalog.text.alignment` to align the subsequences.
We rely on `genalog.text.alignment` to align the subsequences.
"""
import itertools
from collections import Counter
@@ -29,14 +29,12 @@ def get_unique_words(tokens, case_sensitive=False):
"""Get a set of unique words from a Counter dictionary of word occurrences
Arguments:
d {dict} -- a Counter dictionary of word occurrences
Keyword Arguments:
case_sensitive {bool} -- whether unique words are case sensitive
(default: {False})
tokens (list) : a list of word tokens
case_sensitive (bool, optional) : whether unique words are case sensitive.
Defaults to False.
Returns:
a set of unique words (original alphabetical case of the word is preserved)
set: a set of unique words (original alphabetical case of the word is preserved)
"""
if case_sensitive:
word_count = Counter(tokens)
@@ -51,9 +49,9 @@ def segment_len(tokens):
"""Get length of the segment
Arguments:
segment {list} -- a list of tokens
tokens (list) : a list of tokens
Returns:
int -- the length of the segment
int : the length of the segment
"""
return sum(map(len, tokens))
@@ -62,14 +60,14 @@ def get_word_map(unique_words, src_tokens):
"""Arrange the set of unique words by the order they original appear in the text
Arguments:
unique_words {set} -- a set of unique words
src_tokens {list} -- a list of tokens
unique_words (set) : a set of unique words
src_tokens (list) : a list of tokens
Returns:
list -- a `word_map`: a list of word corrdinate tuples (str, int) defined as follow:
(word, word_index)
1. `word` is a typical word token
2. `word_index` is the index of the word in the source token array
list : a ``word_map``: a list of word coordinate tuples ``(word, word_index)`` defined as follows:
1. ``word`` is a typical word token
2. ``word_index`` is the index of the word in the source token array
"""
# Find the indices of the unique words in the source text
unique_word_indices = map(src_tokens.index, unique_words)
@@ -84,18 +82,20 @@ def get_anchor_map(gt_tokens, ocr_tokens, min_anchor_len=2):
and ocr text into smaller text fragments for faster alignment.
Arguments:
gt_tokens {list} -- a list of ground truth tokens
ocr_tokens {list} -- a list of tokens from OCR'ed document
Keyword Arguments:
min_anchor_len {int} -- minimum len of the anchor word
(default: {2})
gt_tokens (list) : a list of ground truth tokens
ocr_tokens (list) : a list of tokens from OCR'ed document
min_anchor_len (int, optional) : minimum len of the anchor word.
Defaults to 2.
Returns:
tuple -- a 2-element tuple (list, list) defined as follow:
(anchor_map_gt, anchor_map_ocr)
1. `anchor_map_gt` is a `word_map` that locates all the anchor words in the gt tokens
2. `anchor_map_gt` is a `word_map` that locates all the anchor words in the ocr tokens
tuple: a 2-element ``(anchor_map_gt, anchor_map_ocr)`` tuple:
1. ``anchor_map_gt`` is a ``word_map`` that locates all the anchor words in the gt tokens
2. ``anchor_map_ocr`` is a ``word_map`` that locates all the anchor words in the ocr tokens
And ``len(anchor_map_gt) == len(anchor_map_ocr)``
::
For example:
Input:
@@ -103,8 +103,7 @@ def get_anchor_map(gt_tokens, ocr_tokens, min_anchor_len=2):
ocr_tokens: ["c", "b", "a"]
Output:
([("b", 0), ("a", 1)], [("b", 1), ("a", 2)])
Invariant:
1. len(anchor_map_gt) == len(anchor_map_ocr)
"""
# 1. Get unique words common in both gt and ocr
unique_words_gt = get_unique_words(gt_tokens)
@@ -160,23 +159,19 @@
"""Recursively find anchor positions in the gt and ocr text
Arguments:
gt_tokens {list} -- a list of ground truth tokens
ocr_tokens {list} -- a list of tokens from OCR'ed document
Keyword Arguments:
start_pos {int} -- a constant to add to all the resulting indices
(default: {0})
max_seg_length {int} -- trigger recursion if any text segment is larger than this
(default: {MAX_ALIGN_SEGMENT_LENGTH})
gt_tokens (list) : a list of ground truth tokens
ocr_tokens (list) : a list of tokens from OCR'ed document
start_pos (int, optional) : a constant to add to all the resulting indices.
Defaults to 0.
max_seg_length (int, optional) : trigger recursion if any text segment is larger than this.
Defaults to ``MAX_ALIGN_SEGMENT_LENGTH``.
Raises:
ValueError: when there are different numbers of anchor points in gt and ocr.
Returns:
tuple -- two lists of token indices:
(output_gt_anchors, output_ocr_anchors)
where each list is the position of the anchor in the input
`gt_tokens` and `ocr_tokens`
tuple : two lists of token indices, where each list gives the positions of the anchors in the input
``gt_tokens`` and ``ocr_tokens``
"""
# 1. Try to find anchor words
anchor_word_map_gt, anchor_word_map_ocr = get_anchor_map(gt_tokens, ocr_tokens)
@@ -234,37 +229,37 @@ def align_w_anchor(gt, ocr, gap_char=GAP_CHAR, max_seg_length=MAX_ALIGN_SEGMENT_
breaks the strings into smaller segments with anchor words.
Then these smaller segments are aligned.
NOTE: this function shares the same contract as `genalog.text.alignment.align()`
**NOTE:** this function shares the same contract as `genalog.text.alignment.align()`
These two methods are interchangeable and their alignment results should be similar.
For example:
::
Ground Truth: "The planet Mars, I scarcely need remind the reader,"
Noisy Text: "The plamet Maris, I scacely neee remind te reader,"
For example:
Here the unique anchor words are "I", "remind" and "reader".
Ground Truth: "The planet Mars, I scarcely need remind the reader,"
Noisy Text: "The plamet Maris, I scacely neee remind te reader,"
Thus, the algorithm will split into following segment pairs:
Here the unique anchor words are "I", "remind" and "reader".
"The planet Mar, "
"The plamet Maris, "
Thus, the algorithm will split into following segment pairs:
"I scarcely need "
"I scacely neee "
"The planet Mar, "
"The plamet Maris, "
"remind the reader,"
"remind te reader,"
"I scarcely need "
"I scacely neee "
And run sequence alignment on each pair.
"remind the reader,"
"remind te reader,"
And run sequence alignment on each pair.
Arguments:
gt {str} -- ground truth text
noise {str} -- text with ocr noise
Keyword Argument:
gap_char {str} -- gap char used in alignment algorithm (default: {GAP_CHAR})
max_seg_length {int} -- maximum segment length. Segments longer than this threshold
will continued be split recursively into smaller segment.
gt (str) : ground truth text
noise (str) : text with ocr noise
gap_char (str, optional) : gap char used in alignment algorithm. Defaults to GAP_CHAR.
max_seg_length (int, optional) : maximum segment length. Segments longer than this threshold
will continue to be split recursively into smaller segments. Defaults to ``MAX_ALIGN_SEGMENT_LENGTH``.
Returns:
a tuple (str, str) of aligned ground truth and noise:

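Since align_w_anchor shares align()'s contract, a sketch on the docstring's own example:

from genalog.text.anchor import align_w_anchor

gt = "The planet Mars, I scarcely need remind the reader,"
noise = "The plamet Maris, I scacely neee remind te reader,"
# Anchor words ("I", "remind", "reader") split the strings into segments
# that are aligned individually and then stitched back together.
aligned_gt, aligned_noise = align_w_anchor(gt, noise)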
View File

@@ -1,38 +1,41 @@
"""This is a utility tool to create CoNLL-formatted token+label files for OCR'ed text
by extracting text from grok OCR output JSON files
and propagating labels from clean text to OCR text.
by extracting text from grok OCR output JSON files and propagating labels from clean
text to OCR text.
usage: conll_format.py [-h] [--train_subset] [--test_subset]
[--gt_folder GT_FOLDER]
base_folder degraded_folder
Usage:
::
positional argument:
base_folder base directory containing the collection of dataset
degraded_folder directory name containing train and test subset for degradation
conll_format.py [-h] [--train_subset] [--test_subset]
[--gt_folder GT_FOLDER]
base_folder degraded_folder
optional arguments:
--train_subset include if only train directory should be processed
--test_subset include if only test directory should be processed
--gt_folder GT_FOLDER directory name containing ground truth (default to `shared`)
Positional Argument:
base_folder base directory containing the collection of dataset
degraded_folder directory name containing train and test subset for degradation
optional arguments:
-h, --help show this help message and exit
Optional Arguments:
--train_subset include if only train directory should be processed
--test_subset include if only test directory should be processed
--gt_folder GT_FOLDER directory name containing ground truth (defaults to `shared`)
example usage
(to run for specified degradation of the dataset on both train and test)
Seek Help:
-h, --help show this help message and exit
Example Usage:
.. code-block:: shell
# to run for specified degradation of the dataset on both train and test
python -m genalog.text.conll_format '/data/enki/datasets/synthetic_dataset/' 'hyphens_all'
(to run for specified degradation of the dataset and ground truth)
python -m genalog.text.conll_format '/data/enki/datasets/synthetic_dataset/' 'hyphens_all'
--gt_folder='shared'
# to run for specified degradation of the dataset and ground truth
python -m genalog.text.conll_format '/data/enki/datasets/synthetic_dataset/' 'hyphens_all' --gt_folder='shared'
(to run for specified degradation of the dataset on only test subset)
python -m genalog.text.conll_format '/data/enki/datasets/synthetic_dataset/' 'hyphens_all'
--test_subset
# to run for specified degradation of the dataset on only test subset
python -m genalog.text.conll_format '/data/enki/datasets/synthetic_dataset/' 'hyphens_all' --test_subset
(to run for specified degradation of the dataset on only train subset)
python -m genalog.text.conll_format '/data/enki/datasets/synthetic_dataset/' 'hyphens_all'
--train_subset
# to run for specified degradation of the dataset on only train subset
python -m genalog.text.conll_format '/data/enki/datasets/synthetic_dataset/' 'hyphens_all' --train_subset
"""
import argparse
import concurrent.futures
@@ -303,10 +306,10 @@ def check_n_sentences(clean_labels_dir, output_labels_dir, clean_label_ext):
----------
clean_labels_dir : str
path of directory with clean labels -
CoNLL formatted so contains tokens and corresponding labels
CoNLL formatted so contains tokens and corresponding labels
output_labels_dir : str
path of directory with ocr labels -
CoNLL formatted so contains tokens and corresponding labels
CoNLL formatted so contains tokens and corresponding labels
"""
text_files = os.listdir(output_labels_dir)
skip_files = []

View File

@@ -52,7 +52,7 @@ def _convert_to_begin_label(label):
"""Convert an inside label, or I-label, (ex. I-PLACE) to a begin label, or B-Label, (ex. B-PLACE)
Arguments:
label {str} -- an NER label
label (str) : an NER label
Returns:
an NER label. This method DOES NOT alter the label unless it is an inside label
@ -67,7 +67,7 @@ def _convert_to_inside_label(label):
"""Convert a begin label, or B-label, (ex. B-PLACE) to an inside label, or I-Label, (ex. B-PLACE)
Arguments:
label {str} -- an NER label
label (str) : an NER label
Returns:
an NER label. This method DOES NOT alter the label unless it is a begin label
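Both helpers reduce to a prefix swap; a minimal sketch (function names here are illustrative, not the module's private helpers):

.. code-block:: python

    def to_begin_label(label):
        # I-PLACE -> B-PLACE; any other label is returned unchanged
        return "B-" + label[2:] if label.startswith("I-") else label

    def to_inside_label(label):
        # B-PLACE -> I-PLACE; any other label is returned unchanged
        return "I-" + label[2:] if label.startswith("B-") else label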
@ -82,9 +82,9 @@ def _is_missing_begin_label(begin_label, inside_label):
"""Validate a inside label given an begin label
Arguments:
begin_label {str} -- a begin NER label used to
begin_label (str) : a begin NER label used to
check if the given label is part of a multi-token label
inside_label {str} -- an inside label to check for its validity
inside_label (str) : an inside label to check for its validity
Returns:
True if the inside label is paired with the begin_label. False otherwise.
@ -111,7 +111,7 @@ def correct_ner_labels(labels):
1. Missing B-Label (i.e. I-PLACE I-PLACE -> B-PLACE I-PLACE)
Arguments:
labels {list} -- list of NER labels
labels (list) : list of NER labels
Returns:
a list of NER labels
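A sketch of that correction rule, assuming an inside label is valid only when the preceding label is a B-/I- label of the same entity type:

.. code-block:: python

    def correct_missing_begin(labels):
        corrected, prev = [], "O"
        for label in labels:
            if label.startswith("I-") and not (
                prev.startswith(("B-", "I-")) and prev[2:] == label[2:]
            ):
                label = "B-" + label[2:]  # e.g. I-PLACE I-PLACE -> B-PLACE I-PLACE
            corrected.append(label)
            prev = label
        return corrected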
@ -152,7 +152,7 @@ def _select_from_multiple_ner_labels(label_indices):
Currently the FIRST label takes precedence.
Arguments:
label_indices {list} -- a list of token indices
label_indices (list) : a list of token indices
Returns:
a specific index
@ -165,8 +165,8 @@ def _find_gap_char_candidates(gt_tokens, ocr_tokens):
"""Find a set of suitable GAP_CHARs based not in the set of input characters
Arguments:
gt_tokens {list} -- a list of tokens
ocr_tokens {list} -- a list of tokens
gt_tokens (list) : a list of tokens
ocr_tokens (list) : a list of tokens
Returns:
(set, set) -- a 2-element tuple of
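Conceptually this is a set difference between a known pool of gap characters and every character seen in the input; a sketch (the pool contents here are hypothetical):

.. code-block:: python

    GAP_CHAR_SET = set("@^~#$%")  # hypothetical pool of candidate gap characters

    def find_gap_char_candidates(gt_tokens, ocr_tokens):
        input_char_set = set("".join(gt_tokens)) | set("".join(ocr_tokens))
        return GAP_CHAR_SET - input_char_set, input_char_set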
@ -192,14 +192,12 @@ def propagate_label_to_ocr(gt_labels, gt_tokens, ocr_tokens, use_anchor=True):
4. string with spaces (" ")
Arguments:
gt_labels {list} -- a list of NER label for ground truth token
gt_tokens {list} -- a list of ground truth string tokens
ocr_tokens {list} -- a list of OCR'ed text tokens
Keyword Arguments:
gap_char {char} -- gap char used in alignment algorithm (default: {alignment.GAP_CHAR})
use_anchor {bool} -- use faster alignment method with anchors if set to True
(default: {True})
gt_labels (list) : a list of NER labels for the ground truth tokens
gt_tokens (list) : a list of ground truth string tokens
ocr_tokens (list) : a list of OCR'ed text tokens
gap_char (char, optional) : gap char used in alignment algorithm. Defaults to ``alignment.GAP_CHAR``.
use_anchor (bool, optional) : use faster alignment method with anchors if set to True.
Defaults to True.
Raises:
GapCharError:
@ -207,13 +205,12 @@ def propagate_label_to_ocr(gt_labels, gt_tokens, ocr_tokens, use_anchor=True):
to set of all possible gap characters (GAP_CHAR_SET)
Returns:
a tuple of 3 elements:
(ocr_labels, aligned_gt, aligned_ocr, gap_char)
tuple : a tuple of 4 elements ``(ocr_labels, aligned_gt, aligned_ocr, gap_char)``
where
`ocr_labels` is a list of NER label for the corresponding ocr tokens
`aligned_gt` is the ground truth string aligned with the ocr text
`aligned_ocr` is the ocr text aligned with ground truth
`gap_char` is the char used by the alignment for inserting gaps
1. ``ocr_labels`` is a list of NER labels for the corresponding OCR tokens
2. ``aligned_gt`` is the ground truth string aligned with the OCR text
3. ``aligned_ocr`` is the OCR text aligned with the ground truth
4. ``gap_char`` is the char used by the alignment to insert gaps
"""
# Find a set of suitable GAP_CHARs that are not in the set of input characters
gap_char_candidates, input_char_set = _find_gap_char_candidates(
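A usage sketch against the documented signature (token and label values are illustrative; the import path assumes this file is ``genalog/text/ner_label.py``):

.. code-block:: python

    from genalog.text import ner_label

    gt_tokens = ["New", "York", "is", "big"]
    gt_labels = ["B-PLACE", "I-PLACE", "O", "O"]
    ocr_tokens = ["New", "is", "big"]  # OCR dropped "York"

    ocr_labels, aligned_gt, aligned_ocr, gap_char = ner_label.propagate_label_to_ocr(
        gt_labels, gt_tokens, ocr_tokens, use_anchor=True
    )
    # ocr_labels holds one NER label per OCR token, e.g. ["B-PLACE", "O", "O"]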
@ -242,11 +239,13 @@ def _propagate_label_to_ocr(
"""Propagate NER label for ground truth tokens to to ocr tokens. Low level implementation
NOTE: that `gt_tokens` and `ocr_tokens` MUST NOT contain invalid tokens.
Invalid tokens are:
1. non-atomic tokens, or space-separated string ("New York")
2. multiple occurrences of the GAP_CHAR ('@@@')
3. empty string ("")
4. string with spaces (" ")
Invalid tokens are:
1. non-atomic tokens, or space-separated string ("New York")
2. multiple occurrences of the GAP_CHAR ('@@@')
3. empty string ("")
4. string with spaces (" ")
::
Case Analysis:
******************************** MULTI-TOKEN-LABELS ********************************
@ -276,14 +275,12 @@ def _propagate_label_to_ocr(
ocr label o o o V O O V O
Arguments:
gt_labels {list} -- a list of NER label for ground truth token
gt_tokens {list} -- a list of ground truth string tokens
ocr_tokens {list} -- a list of OCR'ed text tokens
Keyword Arguments:
gap_char {char} -- gap char used in alignment algorithm (default: {alignment.GAP_CHAR})
use_anchor {bool} -- use faster alignment method with anchors if set to True
(default: {True})
gt_labels (list) : a list of NER labels for the ground truth tokens
gt_tokens (list) : a list of ground truth string tokens
ocr_tokens (list) : a list of OCR'ed text tokens
gap_char (char, optional) : gap char used in alignment algorithm. Defaults to ``alignment.GAP_CHAR``.
use_anchor (bool, optional) : use faster alignment method with anchors if set to True.
Defaults to True.
Raises:
ValueError: when
1. there is an unequal number of gt_tokens and gt_labels
@ -439,27 +436,26 @@ def format_labels(tokens, labels, label_top=True):
"""Format tokens and their NER label for display
Arguments:
tokens {list} -- a list of word tokens
labels {list} -- a list of NER labels
Keyword Arguments:
label_top {bool} -- True if label is place on top of the token
(default: {True})
tokens (list) : a list of word tokens
labels (list) : a list of NER labels
label_top (bool, optional) : True if the label is placed on top of the token.
Defaults to True.
Returns:
a str with each NER label aligned to the token it is labeling
For example:
Given inputs:
tokens: ["New", "York", "is", "big"]
labels: ["B-place", "I-place", "o", "o"]
label_top: True
::
Given inputs:
tokens: ["New", "York", "is", "big"]
labels: ["B-place", "I-place", "o", "o"]
label_top: True
Outputs:
\"B-place I-place o o \"
\"New York is big\"
Outputs:
"
B-place I-place o o \n
New York is big \n
"
"""
formatted_tokens = ""
formatted_labels = ""
@ -492,36 +488,36 @@ def format_label_propagation(
"""Format label propagation for display
Arguments:
gt_tokens {list} -- list of ground truth tokens
gt_labels {list} -- list of NER labels for ground truth tokens
ocr_tokens {list} -- list of OCR'ed text tokens
ocr_labels {list} -- list of NER labels for the OCR'ed tokens
aligned_gt {str} -- ground truth string aligned with the OCR'ed text
aligned_ocr {str} -- OCR'ed text aligned with ground truth
Keyword Arguments:
show_alignment {bool} -- if true, show alignment result (default: {True})
gt_tokens (list) : list of ground truth tokens
gt_labels (list) : list of NER labels for ground truth tokens
ocr_tokens (list) : list of OCR'ed text tokens
ocr_labels (list) : list of NER labels for the OCR'ed tokens
aligned_gt (str) : ground truth string aligned with the OCR'ed text
aligned_ocr (str) : OCR'ed text aligned with ground truth
show_alignment (bool, optional) : if true, show alignment result. Defaults to True.
Returns:
a string formatted for display as follows:
str: a string formatted for display as follows:
.. code-block:: python
if show_alignment:
"B-PLACE I-PLACE V O" # [gt_labels]
"New York is big" # [gt_txt]
"New York is big" # [aligned_gt]
"||||....|||||||"
"New @@@@ is big" # [aligned_ocr]
"New is big " # [ocr_txt]
"B-PLACE V O " # [ocr_labels]
else:
"B-PLACE I-PLACE V O" # [gt_labels]
"New York is big" # [gt_txt]
"New is big" # [ocr_txt]
"B-PLACE V O" # [ocr_labels]
if show_alignment=TRUE
"
B-PLACE I-PLACE V O [gt_labels]
New York is big [gt_txt]
New York is big [aligned_gt]
||||....|||||||
New @@@@ is big [aligned_ocr]
New is big [ocr_txt]
B-PLACE V O [ocr_labels]
"
else
"
B-PLACE I-PLACE V O [gt_labels]
New York is big [gt_txt]
New is big [ocr_txt]
B-PLACE V O [ocr_labels]
"
"""
gt_label_str = format_labels(gt_tokens, gt_labels)
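A usage sketch, assuming the parameters follow the documented order and reusing the values from the propagation example above:

.. code-block:: python

    display = format_label_propagation(
        gt_tokens, gt_labels, ocr_tokens, ocr_labels,
        aligned_gt, aligned_ocr, show_alignment=True,
    )
    print(display)  # labeled gt/ocr rows with the alignment map between them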

View file

@ -8,10 +8,9 @@ def remove_non_ascii(token, replacement=NON_ASCII_REPLACEMENT):
"""Remove non ascii characters in a token
Arguments:
token {str} -- a word token
Keyword Arguments:
replacement {str} -- a replace character for non-ASCII characters
(Default: {NON_ASCII_REPLACEMENT})
token (str) : a word token
replacement (str, optional) : a replacement character for non-ASCII characters.
Defaults to ``NON_ASCII_REPLACEMENT``.
Returns:
str -- a word token with non-ASCII characters removed
"""
@ -27,7 +26,7 @@ def tokenize(s):
"""Tokenize string
Arguments:
s {str} -- aligned string
s (str) : aligned string
Returns:
a list of tokens
@ -40,7 +39,7 @@ def join_tokens(tokens):
"""Join a list of tokens into a string
Arguments:
tokens {list} -- a list of tokens
tokens (list) : a list of tokens
Returns:
a string with space-separated tokens
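The two helpers are near-inverses over whitespace; a minimal sketch:

.. code-block:: python

    def tokenize_sketch(s):
        return s.split()  # split on any run of whitespace

    def join_tokens_sketch(tokens):
        return " ".join(tokens)  # single-space-separated string

Round-tripping ``join_tokens_sketch(tokenize_sketch(s))`` normalizes internal whitespace to single spaces.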

View file

@ -1,11 +1,11 @@
import os
import glob
import os
import numpy as np
import pytest
from genalog.pipeline import AnalogDocumentGeneration, generate_dataset_multiprocess
from genalog.generation.document import DocumentGenerator
from genalog.pipeline import AnalogDocumentGeneration, generate_dataset_multiprocess
EXAMPLE_TEXT_FILE = "tests/unit/text/data/gt_1.txt"
INPUT_TEXT_FILENAMES = glob.glob("tests/unit/text/data/gt_*.txt")