diff --git a/README.md b/README.md index 5461464..ef7c67a 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ Genalog is an open source, cross-platform python package for **gen**erating document images with synthetic noise that mimics scanned an**alog** documents (thus the name `genalog`). You can also add various text degradations to these images. The purpose of this tool is to provide a fast and efficient way to generate synthetic documents from text data by leveraging layout from templates that you create in simple HTML format. +![demo-gif](docs/genalog_docs/static/genalog_demo.gif) + Overview ------------------------------------- Genalog has various capabilities: diff --git a/docs/genalog_docs/index.md b/docs/genalog_docs/index.md index 26939a7..0126661 100644 --- a/docs/genalog_docs/index.md +++ b/docs/genalog_docs/index.md @@ -11,6 +11,11 @@ pip install genalog `genalog` is an open source, cross-platform python package for **gen**erating document images with synthetic noise that mimics scanned an**alog** documents (thus the name `genalog`). You can also add various text degradations to these images. The purpose of this tool is to provide a fast and efficient way to generate synthetic documents from text data by leveraging layout from templates that you can create in simple HTML format. +```{figure} static/genalog_demo.gif +:width: 80% +Generate documents and apply degradations +``` + `genalog` provides several document templates as a start. You can alter the document layout using standard CSS properties like `font-family`, `font-size`, `text-align`, etc. 
Here are some of the example generated documents: ````{tab} Multi-Column diff --git a/docs/genalog_docs/static/genalog_demo.gif b/docs/genalog_docs/static/genalog_demo.gif new file mode 100644 index 0000000..d9224ef Binary files /dev/null and b/docs/genalog_docs/static/genalog_demo.gif differ diff --git a/example/demo_generate.py b/example/demo_generate.py new file mode 100644 index 0000000..04b9081 --- /dev/null +++ b/example/demo_generate.py @@ -0,0 +1,38 @@ +#%% +from genalog.pipeline import AnalogDocumentGeneration +from genalog.degradation.degrader import ImageState + +sample_text = "sample/generation/example.txt" + +# Common CSS properties +STYLE_COMBINATIONS = { + "font_family" : ["Times"], # sans-serif, Times, monospace, etc + "font_size" : ["12px"], + "text_align" : ["justify"], # left, right, center, justify + "language" : ["en_US"], # controls how words are hyphenated + "hyphenate" : [True], +} +# .html.jinja +HTML_TEMPLATE = "columns.html.jinja" +# Degradation effects applied in sequence +DEGRADATIONS = [ + ("blur", {"radius": 5}), # needs to be an odd number + ("bleed_through", { + "src": ImageState.CURRENT_STATE, "background": ImageState.ORIGINAL_STATE, + "alpha": 0.8, + "offset_y": 9, "offset_x": 12 + }), + ("morphology", {"operation": "open", "kernel_shape":(5,5)}), + ("pepper", {"amount": 0.05}), + ("salt", {"amount": 0.2}), +] + +doc_generation = AnalogDocumentGeneration(styles=STYLE_COMBINATIONS, degradations=DEGRADATIONS) +img_array = doc_generation.generate_img(sample_text, HTML_TEMPLATE, target_folder=None) + +import cv2 +from IPython.display import Image, display + +_, encoded_image = cv2.imencode('.png', img_array) +display(Image(data=encoded_image, width=600)) +