diff --git a/genalog/pipeline.py b/genalog/pipeline.py index ef3f410..793efe1 100644 --- a/genalog/pipeline.py +++ b/genalog/pipeline.py @@ -22,12 +22,9 @@ class ImageStateEncoder(JSONEncoder): class AnalogDocumentGeneration(object): def __init__( - self, - template_path=None, - styles=DEFAULT_STYLE_COMBINATION, - degradations=[], - resolution=300, - ): + self, + template_path=None, styles=DEFAULT_STYLE_COMBINATION, + degradations=[], resolution=300): self.doc_generator = DocumentGenerator(template_path=template_path) self.doc_generator.set_styles_to_generate(styles) self.degrader = Degrader(degradations) @@ -42,8 +39,13 @@ class AnalogDocumentGeneration(object): """ return self.doc_generator.template_list + # Fix: rename to generate_sample() + # Add another method called generate_all_styles() def generate_img(self, full_text_path, template, target_folder=None): - """Generate synthetic images given the filepath of a text document + """Generate an image with a sample style given a text document + + NOTE: This does not generate all possible style combinations. + Use generate_all_styles() instead. 
Arguments: full_text_path {str} -- full filepath of a text document (i.e /dataset/doc.txt) @@ -54,6 +56,9 @@ class AnalogDocumentGeneration(object): target_folder {str} -- folder path in which the generated images are stored (default: {None}) resolution {int} -- resolution in dpi (default: {300}) + + Raises: + RuntimeError: when the image cannot be written to disk at the specified path """ with open(full_text_path, "r", encoding="utf8") as f: # read file text = f.read() @@ -61,7 +66,10 @@ class AnalogDocumentGeneration(object): generator = self.doc_generator.create_generator(content, [template]) # Generate the image - doc = next(generator) # TODO: this does not exhaust all of the style combinations in the generator + try: + doc = next(generator) # NOTE: this does not exhaust all of the style combinations in the generator + except StopIteration: + return None src = doc.render_array(resolution=self.resolution, channel="GRAYSCALE") # Degrade the image dst = self.degrader.apply_effects(src) @@ -74,7 +82,8 @@ class AnalogDocumentGeneration(object): text_filename = os.path.basename(full_text_path) img_filename = text_filename.replace(".txt", ".png") img_dst_path = os.path.join(target_folder, "img", img_filename) - cv2.imwrite(img_dst_path, dst) + if not cv2.imwrite(img_dst_path, dst): + raise RuntimeError(f"Could not write to path {img_dst_path}") return @@ -115,14 +124,9 @@ def _set_batch_generate_args( def generate_dataset_multiprocess( - input_text_files, - output_folder, - styles, - degradations, - template, - resolution=300, - batch_size=25, -): + input_text_files, output_folder, + styles, degradations, template, + resolution=300, batch_size=25): _setup_folder(output_folder) print(f"Storing generated images in {output_folder}") diff --git a/requirements-dev.txt b/requirements-dev.txt index 5aa0690..51eb4f2 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,4 +2,6 @@ flake8 flake8-import-order pytest pytest-cov +pytest-mock +pytest-lazy-fixture tox diff --git 
a/tests/e2e/test_pipeline.py b/tests/e2e/test_pipeline.py index 3d01e72..fae3910 100644 --- a/tests/e2e/test_pipeline.py +++ b/tests/e2e/test_pipeline.py @@ -1,9 +1,10 @@ import os import glob +import numpy as np import pytest -from genalog import pipeline +from genalog.pipeline import AnalogDocumentGeneration, generate_dataset_multiprocess from genalog.generation.document import DocumentGenerator EXAMPLE_TEXT_FILE = "tests/unit/text/data/gt_1.txt" @@ -18,33 +19,69 @@ DEGRATIONS = [ @pytest.fixture -def default_analog_generator(): - return pipeline.AnalogDocumentGeneration() +def default_doc_generator(): + return AnalogDocumentGeneration() @pytest.fixture -def custom_analog_generator(): - return pipeline.AnalogDocumentGeneration( - styles=STYLES, degradations=DEGRATIONS, resolution=300 - ) +def custom_doc_generator(): + return AnalogDocumentGeneration(styles=STYLES, degradations=DEGRATIONS, resolution=300) -def test_default_generate_img(default_analog_generator): - assert len(default_analog_generator.list_templates()) > 0 - example_template = default_analog_generator.list_templates()[0] - default_analog_generator.generate_img( +@pytest.fixture +def empty_style_doc_generator(): + return AnalogDocumentGeneration(styles={}) + + +@pytest.mark.parametrize("doc_generator", [ + pytest.lazy_fixture('default_doc_generator'), + pytest.lazy_fixture('custom_doc_generator') +]) +def test_generate_img_array(doc_generator): + # Precondition checks + assert len(doc_generator.list_templates()) > 0 + + example_template = doc_generator.list_templates()[0] + sample_img = doc_generator.generate_img( EXAMPLE_TEXT_FILE, example_template, target_folder=None ) + assert sample_img is not None + assert isinstance(sample_img, np.ndarray) -def test_custom_generate_img(custom_analog_generator): - assert len(custom_analog_generator.list_templates()) > 0 - example_template = custom_analog_generator.list_templates()[0] - custom_analog_generator.generate_img( +def 
test_generate_img_array_empty(empty_style_doc_generator): + # Precondition checks + assert len(empty_style_doc_generator.list_templates()) > 0 + + example_template = empty_style_doc_generator.list_templates()[0] + sample_img = empty_style_doc_generator.generate_img( EXAMPLE_TEXT_FILE, example_template, target_folder=None ) + assert sample_img is None +@pytest.mark.io +@pytest.mark.parametrize("doc_generator", [ + pytest.lazy_fixture('default_doc_generator'), + pytest.lazy_fixture('custom_doc_generator') +]) +def test_generate_img_write_to_disk(tmpdir, doc_generator): + os.makedirs(os.path.join(tmpdir, "img")) # TODO: generate_img() store image under "img" folder + output_img_wildcard = os.path.join(tmpdir, "img", "*.png") + num_generated_img = glob.glob(output_img_wildcard) + # Precondition checks + assert len(num_generated_img) == 0 + assert len(doc_generator.list_templates()) > 0 + + example_template = doc_generator.list_templates()[0] + doc_generator.generate_img( + EXAMPLE_TEXT_FILE, example_template, target_folder=tmpdir + ) + num_generated_img = glob.glob(output_img_wildcard) # look for any png on file + assert len(num_generated_img) > 0 + + +@pytest.mark.io @pytest.mark.parametrize("styles", [ STYLES, pytest.param( @@ -56,9 +93,9 @@ def test_custom_generate_img(custom_analog_generator): def test_generate_dataset_multiprocess(tmpdir, folder_name, styles): assert len(INPUT_TEXT_FILENAMES) > 0 output_folder = os.path.join(tmpdir, folder_name) - pipeline.generate_dataset_multiprocess( + generate_dataset_multiprocess( INPUT_TEXT_FILENAMES, output_folder, styles, DEGRATIONS, "text_block.html.jinja" ) - num_generated_img = glob.glob(os.path.join(output_folder, "**/*.png")) + num_generated_img = glob.glob(os.path.join(output_folder, "**", "*.png")) assert len(num_generated_img) > 0 assert len(num_generated_img) == len(INPUT_TEXT_FILENAMES) * len(DocumentGenerator.expand_style_combinations(styles)) diff --git a/tox.ini b/tox.ini index 6d02f37..2961cdf 100644 --- 
a/tox.ini +++ b/tox.ini @@ -34,6 +34,7 @@ markers = # EX: pytest -m "not slow and not azure" slow: marks tests as slow (deselect with '-m "not slow"') azure: marks as integration tests that require azure resource + io: marks integration tests involving some form of I/O operations (disk, internet, etc) testpaths = tests addopts =