Upgrade redact_cli_py to 0.3.2 (#1044)

* Upgrade redact_cli_py to 0.3.2

* Fix mailto typo, update document URL
This commit is contained in:
Chia-Sheng Chen 2022-08-24 17:07:22 +08:00 коммит произвёл GitHub
Родитель 2e5ef6f0ae
Коммит 801731cff2
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
48 изменённых файлов: 6517 добавлений и 557 удалений

Просмотреть файл

@ -0,0 +1,3 @@
[flake8]
max-line-length = 88
extend-ignore = E203, E501, PIE798

Просмотреть файл

@ -6,6 +6,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
## [0.3.2] - 2022-08-11
### Changed
- Refactor code styles with flake8/black and their extensions.
## [0.3.1] - 2022-08-02
### Added
- Support for multi-page PDFs and TIFFs in the batch redact CLI (`batch_redact.py`).
## [0.3.0] - 2022-01-06
### Added
- Support for the FormRecognizer OCR result v3.0 format, while maintaining backward compatibility with v2.0 and v2.1.
### Changed
- The default API version of OCR result redaction has changed from v2.x to v3.x schema.
- You now need to specify which version of the OCR result you want to redact in `redact.py` and `batch_redact.py`.
- Before:
``` bash
python redact.py ocr <ocr_result_path> <fott_label_path> <output_path>
python batch_redact.py <input_container> <input_folder_path> <output_container> <output_folder_path>
```
- After:
``` bash
python redact.py ocr <ocr_result_path> <fott_label_path> <output_path> <api_version>
python batch_redact.py <input_container> <input_folder_path> <output_container> <output_folder_path> <api_version>
```
Where API Version is one of the following:
- v2.0
- v2.1
- v3.0
## [0.2.3] - 2021-12-13
### Added
- Support for redacting some Latin ligature letters and letters with diacritics.

Просмотреть файл

@ -10,6 +10,12 @@ shapely = "*"
dacite = "*"
azure-storage-blob = "*"
pypdfium = "*"
flake8 = "*"
black = "*"
flake8-bugbear = "*"
flake8-pie = "*"
pep8-naming = "*"
flake8-black = "*"
[dev-packages]
pytest = "*"

622
scripts/redact_cli_py/Pipfile.lock сгенерированный
Просмотреть файл

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "6a2ce598371ced09c629f7844aa4c1172acbef11465108fc637f7e06958a1524"
"sha256": "7afbe6fd0e14f4c0b98d8ee3aa9e90e49b1250c72d796d8144fae0067f787d2a"
},
"pipfile-spec": 6,
"requires": {
@ -16,105 +16,187 @@
]
},
"default": {
"attrs": {
"hashes": [
"sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6",
"sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"
],
"markers": "python_version >= '3.5'",
"version": "==22.1.0"
},
"azure-core": {
"hashes": [
"sha256:25407390dde142d3e41ecf78bb18cedda9b7f7a0af558d082dec711c4a334f46",
"sha256:906e031a8241fe0794ec4137aca77a1aeab2ebde5cd6049c377d05cb6b87b691"
"sha256:0f3a20d245659bf81fb3670070a5410c8d4a43298d5a981e62dce393000a9084",
"sha256:a76856fa83efe1925a4fd917dc179c7daa15917dd71da2774833fa82a96f3dfa"
],
"version": "==1.17.0"
"markers": "python_version >= '3.6'",
"version": "==1.24.2"
},
"azure-storage-blob": {
"hashes": [
"sha256:e74c2c49fd04b80225f5b9734f1dbd417d89f280abfedccced3ac21509e1659d",
"sha256:eb37b50ddfb6e558b29f6c8c03b0666514e55d6170bf4624e7261a3af93c6401"
"sha256:280a6ab032845bab9627582bee78a50497ca2f14772929b5c5ee8b4605af0cb3",
"sha256:53f0d4cd32970ac9ff9b9753f83dd2fb3f9ac30e1d01e71638c436c509bfd884"
],
"index": "pypi",
"version": "==12.8.1"
"version": "==12.13.0"
},
"black": {
"hashes": [
"sha256:074458dc2f6e0d3dab7928d4417bb6957bb834434516f21514138437accdbe90",
"sha256:187d96c5e713f441a5829e77120c269b6514418f4513a390b0499b0987f2ff1c",
"sha256:2ea29072e954a4d55a2ff58971b83365eba5d3d357352a07a7a4df0d95f51c78",
"sha256:4af5bc0e1f96be5ae9bd7aaec219c901a94d6caa2484c21983d043371c733fc4",
"sha256:560558527e52ce8afba936fcce93a7411ab40c7d5fe8c2463e279e843c0328ee",
"sha256:568ac3c465b1c8b34b61cd7a4e349e93f91abf0f9371eda1cf87194663ab684e",
"sha256:6797f58943fceb1c461fb572edbe828d811e719c24e03375fd25170ada53825e",
"sha256:6c1734ab264b8f7929cef8ae5f900b85d579e6cbfde09d7387da8f04771b51c6",
"sha256:6c6d39e28aed379aec40da1c65434c77d75e65bb59a1e1c283de545fb4e7c6c9",
"sha256:7ba9be198ecca5031cd78745780d65a3f75a34b2ff9be5837045dce55db83d1c",
"sha256:94783f636bca89f11eb5d50437e8e17fbc6a929a628d82304c80fa9cd945f256",
"sha256:a218d7e5856f91d20f04e931b6f16d15356db1c846ee55f01bac297a705ca24f",
"sha256:a3db5b6409b96d9bd543323b23ef32a1a2b06416d525d27e0f67e74f1446c8f2",
"sha256:ac609cf8ef5e7115ddd07d85d988d074ed00e10fbc3445aee393e70164a2219c",
"sha256:b154e6bbde1e79ea3260c4b40c0b7b3109ffcdf7bc4ebf8859169a6af72cd70b",
"sha256:b270a168d69edb8b7ed32c193ef10fd27844e5c60852039599f9184460ce0807",
"sha256:b9fd45787ba8aa3f5e0a0a98920c1012c884622c6c920dbe98dbd05bc7c70fbf",
"sha256:c85928b9d5f83b23cee7d0efcb310172412fbf7cb9d9ce963bd67fd141781def",
"sha256:c9a3ac16efe9ec7d7381ddebcc022119794872abce99475345c5a61aa18c45ad",
"sha256:cfaf3895a9634e882bf9d2363fed5af8888802d670f58b279b0bece00e9a872d",
"sha256:e439798f819d49ba1c0bd9664427a05aab79bfba777a6db94fd4e56fae0cb849",
"sha256:f586c26118bc6e714ec58c09df0157fe2d9ee195c764f630eb0d8e7ccce72e69",
"sha256:f6fe02afde060bbeef044af7996f335fbe90b039ccf3f5eb8f16df8b20f77666"
],
"index": "pypi",
"version": "==22.6.0"
},
"certifi": {
"hashes": [
"sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
"sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d",
"sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"
],
"version": "==2021.5.30"
"markers": "python_version >= '3.6'",
"version": "==2022.6.15"
},
"cffi": {
"hashes": [
"sha256:06c54a68935738d206570b20da5ef2b6b6d92b38ef3ec45c5422c0ebaf338d4d",
"sha256:0c0591bee64e438883b0c92a7bed78f6290d40bf02e54c5bf0978eaf36061771",
"sha256:19ca0dbdeda3b2615421d54bef8985f72af6e0c47082a8d26122adac81a95872",
"sha256:22b9c3c320171c108e903d61a3723b51e37aaa8c81255b5e7ce102775bd01e2c",
"sha256:26bb2549b72708c833f5abe62b756176022a7b9a7f689b571e74c8478ead51dc",
"sha256:33791e8a2dc2953f28b8d8d300dde42dd929ac28f974c4b4c6272cb2955cb762",
"sha256:3c8d896becff2fa653dc4438b54a5a25a971d1f4110b32bd3068db3722c80202",
"sha256:4373612d59c404baeb7cbd788a18b2b2a8331abcc84c3ba40051fcd18b17a4d5",
"sha256:487d63e1454627c8e47dd230025780e91869cfba4c753a74fda196a1f6ad6548",
"sha256:48916e459c54c4a70e52745639f1db524542140433599e13911b2f329834276a",
"sha256:4922cd707b25e623b902c86188aca466d3620892db76c0bdd7b99a3d5e61d35f",
"sha256:55af55e32ae468e9946f741a5d51f9896da6b9bf0bbdd326843fec05c730eb20",
"sha256:57e555a9feb4a8460415f1aac331a2dc833b1115284f7ded7278b54afc5bd218",
"sha256:5d4b68e216fc65e9fe4f524c177b54964af043dde734807586cf5435af84045c",
"sha256:64fda793737bc4037521d4899be780534b9aea552eb673b9833b01f945904c2e",
"sha256:6d6169cb3c6c2ad50db5b868db6491a790300ade1ed5d1da29289d73bbe40b56",
"sha256:7bcac9a2b4fdbed2c16fa5681356d7121ecabf041f18d97ed5b8e0dd38a80224",
"sha256:80b06212075346b5546b0417b9f2bf467fea3bfe7352f781ffc05a8ab24ba14a",
"sha256:818014c754cd3dba7229c0f5884396264d51ffb87ec86e927ef0be140bfdb0d2",
"sha256:8eb687582ed7cd8c4bdbff3df6c0da443eb89c3c72e6e5dcdd9c81729712791a",
"sha256:99f27fefe34c37ba9875f224a8f36e31d744d8083e00f520f133cab79ad5e819",
"sha256:9f3e33c28cd39d1b655ed1ba7247133b6f7fc16fa16887b120c0c670e35ce346",
"sha256:a8661b2ce9694ca01c529bfa204dbb144b275a31685a075ce123f12331be790b",
"sha256:a9da7010cec5a12193d1af9872a00888f396aba3dc79186604a09ea3ee7c029e",
"sha256:aedb15f0a5a5949ecb129a82b72b19df97bbbca024081ed2ef88bd5c0a610534",
"sha256:b315d709717a99f4b27b59b021e6207c64620790ca3e0bde636a6c7f14618abb",
"sha256:ba6f2b3f452e150945d58f4badd92310449876c4c954836cfb1803bdd7b422f0",
"sha256:c33d18eb6e6bc36f09d793c0dc58b0211fccc6ae5149b808da4a62660678b156",
"sha256:c9a875ce9d7fe32887784274dd533c57909b7b1dcadcc128a2ac21331a9765dd",
"sha256:c9e005e9bd57bc987764c32a1bee4364c44fdc11a3cc20a40b93b444984f2b87",
"sha256:d2ad4d668a5c0645d281dcd17aff2be3212bc109b33814bbb15c4939f44181cc",
"sha256:d950695ae4381ecd856bcaf2b1e866720e4ab9a1498cba61c602e56630ca7195",
"sha256:e22dcb48709fc51a7b58a927391b23ab37eb3737a98ac4338e2448bef8559b33",
"sha256:e8c6a99be100371dbb046880e7a282152aa5d6127ae01783e37662ef73850d8f",
"sha256:e9dc245e3ac69c92ee4c167fbdd7428ec1956d4e754223124991ef29eb57a09d",
"sha256:eb687a11f0a7a1839719edd80f41e459cc5366857ecbed383ff376c4e3cc6afd",
"sha256:eb9e2a346c5238a30a746893f23a9535e700f8192a68c07c0258e7ece6ff3728",
"sha256:ed38b924ce794e505647f7c331b22a693bee1538fdf46b0222c4717b42f744e7",
"sha256:f0010c6f9d1a4011e429109fda55a225921e3206e7f62a0c22a35344bfd13cca",
"sha256:f0c5d1acbfca6ebdd6b1e3eded8d261affb6ddcf2186205518f1428b8569bb99",
"sha256:f10afb1004f102c7868ebfe91c28f4a712227fe4cb24974350ace1f90e1febbf",
"sha256:f174135f5609428cc6e1b9090f9268f5c8935fddb1b25ccb8255a2d50de6789e",
"sha256:f3ebe6e73c319340830a9b2825d32eb6d8475c1dac020b4f0aa774ee3b898d1c",
"sha256:f627688813d0a4140153ff532537fbe4afea5a3dffce1f9deb7f91f848a832b5",
"sha256:fd4305f86f53dfd8cd3522269ed7fc34856a8ee3709a5e28b2836b2db9d4cd69"
"sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5",
"sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef",
"sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104",
"sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426",
"sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405",
"sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375",
"sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a",
"sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e",
"sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc",
"sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf",
"sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185",
"sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497",
"sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3",
"sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35",
"sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c",
"sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83",
"sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21",
"sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca",
"sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984",
"sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac",
"sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd",
"sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee",
"sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a",
"sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2",
"sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192",
"sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7",
"sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585",
"sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f",
"sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e",
"sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27",
"sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b",
"sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e",
"sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e",
"sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d",
"sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c",
"sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415",
"sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82",
"sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02",
"sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314",
"sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325",
"sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c",
"sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3",
"sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914",
"sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045",
"sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d",
"sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9",
"sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5",
"sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2",
"sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c",
"sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3",
"sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2",
"sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8",
"sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d",
"sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d",
"sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9",
"sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162",
"sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76",
"sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4",
"sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e",
"sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9",
"sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6",
"sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b",
"sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01",
"sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"
],
"version": "==1.14.6"
"version": "==1.15.1"
},
"charset-normalizer": {
"hashes": [
"sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b",
"sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3"
"sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5",
"sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"
],
"markers": "python_version >= '3'",
"version": "==2.0.4"
"markers": "python_version >= '3.6'",
"version": "==2.1.0"
},
"click": {
"hashes": [
"sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e",
"sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"
],
"markers": "python_version >= '3.7'",
"version": "==8.1.3"
},
"colorama": {
"hashes": [
"sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da",
"sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"
],
"markers": "platform_system == 'Windows'",
"version": "==0.4.5"
},
"cryptography": {
"hashes": [
"sha256:0f1212a66329c80d68aeeb39b8a16d54ef57071bf22ff4e521657b27372e327d",
"sha256:1e056c28420c072c5e3cb36e2b23ee55e260cb04eee08f702e0edfec3fb51959",
"sha256:240f5c21aef0b73f40bb9f78d2caff73186700bf1bc6b94285699aff98cc16c6",
"sha256:26965837447f9c82f1855e0bc8bc4fb910240b6e0d16a664bb722df3b5b06873",
"sha256:37340614f8a5d2fb9aeea67fd159bfe4f5f4ed535b1090ce8ec428b2f15a11f2",
"sha256:3d10de8116d25649631977cb37da6cbdd2d6fa0e0281d014a5b7d337255ca713",
"sha256:3d8427734c781ea5f1b41d6589c293089704d4759e34597dce91014ac125aad1",
"sha256:7ec5d3b029f5fa2b179325908b9cd93db28ab7b85bb6c1db56b10e0b54235177",
"sha256:8e56e16617872b0957d1c9742a3f94b43533447fd78321514abbe7db216aa250",
"sha256:b01fd6f2737816cb1e08ed4807ae194404790eac7ad030b34f2ce72b332f5586",
"sha256:bf40af59ca2465b24e54f671b2de2c59257ddc4f7e5706dbd6930e26823668d3",
"sha256:de4e5f7f68220d92b7637fc99847475b59154b7a1b3868fb7385337af54ac9ca",
"sha256:eb8cc2afe8b05acbd84a43905832ec78e7b3873fb124ca190f574dca7389a87d",
"sha256:ee77aa129f481be46f8d92a1a7db57269a2f23052d5f2433b4621bb457081cc9"
"sha256:190f82f3e87033821828f60787cfa42bff98404483577b591429ed99bed39d59",
"sha256:2be53f9f5505673eeda5f2736bea736c40f051a739bfae2f92d18aed1eb54596",
"sha256:30788e070800fec9bbcf9faa71ea6d8068f5136f60029759fd8c3efec3c9dcb3",
"sha256:3d41b965b3380f10e4611dbae366f6dc3cefc7c9ac4e8842a806b9672ae9add5",
"sha256:4c590ec31550a724ef893c50f9a97a0c14e9c851c85621c5650d699a7b88f7ab",
"sha256:549153378611c0cca1042f20fd9c5030d37a72f634c9326e225c9f666d472884",
"sha256:63f9c17c0e2474ccbebc9302ce2f07b55b3b3fcb211ded18a42d5764f5c10a82",
"sha256:6bc95ed67b6741b2607298f9ea4932ff157e570ef456ef7ff0ef4884a134cc4b",
"sha256:7099a8d55cd49b737ffc99c17de504f2257e3787e02abe6d1a6d136574873441",
"sha256:75976c217f10d48a8b5a8de3d70c454c249e4b91851f6838a4e48b8f41eb71aa",
"sha256:7bc997818309f56c0038a33b8da5c0bfbb3f1f067f315f9abd6fc07ad359398d",
"sha256:80f49023dd13ba35f7c34072fa17f604d2f19bf0989f292cedf7ab5770b87a0b",
"sha256:91ce48d35f4e3d3f1d83e29ef4a9267246e6a3be51864a5b7d2247d5086fa99a",
"sha256:a958c52505c8adf0d3822703078580d2c0456dd1d27fabfb6f76fe63d2971cd6",
"sha256:b62439d7cd1222f3da897e9a9fe53bbf5c104fff4d60893ad1355d4c14a24157",
"sha256:b7f8dd0d4c1f21759695c05a5ec8536c12f31611541f8904083f3dc582604280",
"sha256:d204833f3c8a33bbe11eda63a54b1aad7aa7456ed769a982f21ec599ba5fa282",
"sha256:e007f052ed10cc316df59bc90fbb7ff7950d7e2919c9757fd42a2b8ecf8a5f67",
"sha256:f2dcb0b3b63afb6df7fd94ec6fbddac81b5492513f7b0436210d390c14d46ee8",
"sha256:f721d1885ecae9078c3f6bbe8a88bc0786b6e749bf32ccec1ef2b18929a05046",
"sha256:f7a6de3e98771e183645181b3627e2563dcde3ce94a9e42a3f427d2255190327",
"sha256:f8c0a6e9e1dd3eb0414ba320f85da6b0dcbd543126e30fcc546e7372a7fbf3b9"
],
"markers": "python_version >= '3.6'",
"version": "==3.4.7"
"version": "==37.0.4"
},
"dacite": {
"hashes": [
@ -124,96 +206,202 @@
"index": "pypi",
"version": "==1.6.0"
},
"flake8": {
"hashes": [
"sha256:93aa565ae2f0316b95bb57a354f2b2d55ee8508e1fe1cb13b77b9c195b4a2537",
"sha256:b27fd7faa8d90aaae763664a489012292990388e5d3604f383b290caefbbc922"
],
"index": "pypi",
"version": "==5.0.3"
},
"flake8-black": {
"hashes": [
"sha256:7d667d0059fd1aa468de1669d77cc934b7f1feeac258d57bdae69a8e73c4cd90",
"sha256:8211f5e20e954cb57c709acccf2f3281ce27016d4c4b989c3e51f878bb7ce12a"
],
"index": "pypi",
"version": "==0.3.3"
},
"flake8-bugbear": {
"hashes": [
"sha256:db5d7a831ef4412a224b26c708967ff816818cabae415e76b8c58df156c4b8e5",
"sha256:e450976a07e4f9d6c043d4f72b17ec1baf717fe37f7997009c8ae58064f88305"
],
"index": "pypi",
"version": "==22.7.1"
},
"flake8-pie": {
"hashes": [
"sha256:47fd9d232b419f8db7a6465dee95cc24b385b1b8bdfd62b65250d70eaa06fc89",
"sha256:a2d1e67a374d925f688300e9d0e202d1827a0d91e0a11114f712beee639bdc7c"
],
"index": "pypi",
"version": "==0.15.0"
},
"idna": {
"hashes": [
"sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a",
"sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
"version": "==3.2"
"markers": "python_version >= '3.5'",
"version": "==3.3"
},
"isodate": {
"hashes": [
"sha256:2e364a3d5759479cdb2d37cce6b9376ea504db2ff90252a2e5b7cc89cc9ff2d8",
"sha256:aa4d33c06640f5352aca96e4b81afd8ab3b47337cc12089822d6f322ac772c81"
"sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96",
"sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"
],
"version": "==0.6.0"
"version": "==0.6.1"
},
"jsonpointer": {
"hashes": [
"sha256:150f80c5badd02c757da6644852f612f88e8b4bc2f9852dcbf557c8738919686",
"sha256:5a34b698db1eb79ceac454159d3f7c12a451a91f6334a4f638454327b7a89962"
"sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9",
"sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"
],
"index": "pypi",
"version": "==2.1"
"version": "==2.3"
},
"mccabe": {
"hashes": [
"sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325",
"sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"
],
"markers": "python_version >= '3.6'",
"version": "==0.7.0"
},
"msrest": {
"hashes": [
"sha256:72661bc7bedc2dc2040e8f170b6e9ef226ee6d3892e01affd4d26b06474d68d8",
"sha256:c840511c845330e96886011a236440fafc2c9aff7b2df9c0a92041ee2dee3782"
"sha256:21120a810e1233e5e6cc7fe40b474eeb4ec6f757a15d7cf86702c369f9567c32",
"sha256:6e7661f46f3afd88b75667b7187a92829924446c7ea1d169be8c4bb7eeb788b9"
],
"version": "==0.6.21"
"markers": "python_version >= '3.6'",
"version": "==0.7.1"
},
"mypy-extensions": {
"hashes": [
"sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
"sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
],
"version": "==0.4.3"
},
"oauthlib": {
"hashes": [
"sha256:42bf6354c2ed8c6acb54d971fce6f88193d97297e18602a3a886603f9d7730cc",
"sha256:8f0215fcc533dd8dd1bee6f4c412d4f0cd7297307d43ac61666389e3bc3198a3"
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
"sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe"
],
"markers": "python_version >= '3.6'",
"version": "==3.1.1"
"version": "==3.2.0"
},
"pathspec": {
"hashes": [
"sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a",
"sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"
],
"version": "==0.9.0"
},
"pep8-naming": {
"hashes": [
"sha256:3af77cdaa9c7965f7c85a56cd579354553c9bbd3fdf3078a776f12db54dd6944",
"sha256:f7867c1a464fe769be4f972ef7b79d6df1d9aff1b1f04ecf738d471963d3ab9c"
],
"index": "pypi",
"version": "==0.13.1"
},
"pillow": {
"hashes": [
"sha256:0b2efa07f69dc395d95bb9ef3299f4ca29bcb2157dc615bae0b42c3c20668ffc",
"sha256:114f816e4f73f9ec06997b2fde81a92cbf0777c9e8f462005550eed6bae57e63",
"sha256:147bd9e71fb9dcf08357b4d530b5167941e222a6fd21f869c7911bac40b9994d",
"sha256:15a2808e269a1cf2131930183dcc0419bc77bb73eb54285dde2706ac9939fa8e",
"sha256:196560dba4da7a72c5e7085fccc5938ab4075fd37fe8b5468869724109812edd",
"sha256:1c03e24be975e2afe70dfc5da6f187eea0b49a68bb2b69db0f30a61b7031cee4",
"sha256:1fd5066cd343b5db88c048d971994e56b296868766e461b82fa4e22498f34d77",
"sha256:29c9569049d04aaacd690573a0398dbd8e0bf0255684fee512b413c2142ab723",
"sha256:2b6dfa068a8b6137da34a4936f5a816aba0ecc967af2feeb32c4393ddd671cba",
"sha256:2cac53839bfc5cece8fdbe7f084d5e3ee61e1303cccc86511d351adcb9e2c792",
"sha256:2ee77c14a0299d0541d26f3d8500bb57e081233e3fa915fa35abd02c51fa7fae",
"sha256:37730f6e68bdc6a3f02d2079c34c532330d206429f3cee651aab6b66839a9f0e",
"sha256:3f08bd8d785204149b5b33e3b5f0ebbfe2190ea58d1a051c578e29e39bfd2367",
"sha256:479ab11cbd69612acefa8286481f65c5dece2002ffaa4f9db62682379ca3bb77",
"sha256:4bc3c7ef940eeb200ca65bd83005eb3aae8083d47e8fcbf5f0943baa50726856",
"sha256:660a87085925c61a0dcc80efb967512ac34dbb256ff7dd2b9b4ee8dbdab58cf4",
"sha256:67b3666b544b953a2777cb3f5a922e991be73ab32635666ee72e05876b8a92de",
"sha256:70af7d222df0ff81a2da601fab42decb009dc721545ed78549cb96e3a1c5f0c8",
"sha256:75e09042a3b39e0ea61ce37e941221313d51a9c26b8e54e12b3ececccb71718a",
"sha256:8960a8a9f4598974e4c2aeb1bff9bdd5db03ee65fd1fce8adf3223721aa2a636",
"sha256:9364c81b252d8348e9cc0cb63e856b8f7c1b340caba6ee7a7a65c968312f7dab",
"sha256:969cc558cca859cadf24f890fc009e1bce7d7d0386ba7c0478641a60199adf79",
"sha256:9a211b663cf2314edbdb4cf897beeb5c9ee3810d1d53f0e423f06d6ebbf9cd5d",
"sha256:a17ca41f45cf78c2216ebfab03add7cc350c305c38ff34ef4eef66b7d76c5229",
"sha256:a2f381932dca2cf775811a008aa3027671ace723b7a38838045b1aee8669fdcf",
"sha256:a4eef1ff2d62676deabf076f963eda4da34b51bc0517c70239fafed1d5b51500",
"sha256:c088a000dfdd88c184cc7271bfac8c5b82d9efa8637cd2b68183771e3cf56f04",
"sha256:c0e0550a404c69aab1e04ae89cca3e2a042b56ab043f7f729d984bf73ed2a093",
"sha256:c11003197f908878164f0e6da15fce22373ac3fc320cda8c9d16e6bba105b844",
"sha256:c2a5ff58751670292b406b9f06e07ed1446a4b13ffced6b6cab75b857485cbc8",
"sha256:c35d09db702f4185ba22bb33ef1751ad49c266534339a5cebeb5159d364f6f82",
"sha256:c379425c2707078dfb6bfad2430728831d399dc95a7deeb92015eb4c92345eaf",
"sha256:cc866706d56bd3a7dbf8bac8660c6f6462f2f2b8a49add2ba617bc0c54473d83",
"sha256:d0da39795049a9afcaadec532e7b669b5ebbb2a9134576ebcc15dd5bdae33cc0",
"sha256:f156d6ecfc747ee111c167f8faf5f4953761b5e66e91a4e6767e548d0f80129c",
"sha256:f4ebde71785f8bceb39dcd1e7f06bcc5d5c3cf48b9f69ab52636309387b097c8",
"sha256:fc214a6b75d2e0ea7745488da7da3c381f41790812988c7a92345978414fad37",
"sha256:fd7eef578f5b2200d066db1b50c4aa66410786201669fb76d5238b007918fb24",
"sha256:ff04c373477723430dce2e9d024c708a047d44cf17166bf16e604b379bf0ca14"
"sha256:0030fdbd926fb85844b8b92e2f9449ba89607231d3dd597a21ae72dc7fe26927",
"sha256:030e3460861488e249731c3e7ab59b07c7853838ff3b8e16aac9561bb345da14",
"sha256:0ed2c4ef2451de908c90436d6e8092e13a43992f1860275b4d8082667fbb2ffc",
"sha256:136659638f61a251e8ed3b331fc6ccd124590eeff539de57c5f80ef3a9594e58",
"sha256:13b725463f32df1bfeacbf3dd197fb358ae8ebcd8c5548faa75126ea425ccb60",
"sha256:1536ad017a9f789430fb6b8be8bf99d2f214c76502becc196c6f2d9a75b01b76",
"sha256:15928f824870535c85dbf949c09d6ae7d3d6ac2d6efec80f3227f73eefba741c",
"sha256:17d4cafe22f050b46d983b71c707162d63d796a1235cdf8b9d7a112e97b15bac",
"sha256:1802f34298f5ba11d55e5bb09c31997dc0c6aed919658dfdf0198a2fe75d5490",
"sha256:1cc1d2451e8a3b4bfdb9caf745b58e6c7a77d2e469159b0d527a4554d73694d1",
"sha256:1fd6f5e3c0e4697fa7eb45b6e93996299f3feee73a3175fa451f49a74d092b9f",
"sha256:254164c57bab4b459f14c64e93df11eff5ded575192c294a0c49270f22c5d93d",
"sha256:2ad0d4df0f5ef2247e27fc790d5c9b5a0af8ade9ba340db4a73bb1a4a3e5fb4f",
"sha256:2c58b24e3a63efd22554c676d81b0e57f80e0a7d3a5874a7e14ce90ec40d3069",
"sha256:2d33a11f601213dcd5718109c09a52c2a1c893e7461f0be2d6febc2879ec2402",
"sha256:337a74fd2f291c607d220c793a8135273c4c2ab001b03e601c36766005f36885",
"sha256:37ff6b522a26d0538b753f0b4e8e164fdada12db6c6f00f62145d732d8a3152e",
"sha256:3d1f14f5f691f55e1b47f824ca4fdcb4b19b4323fe43cc7bb105988cad7496be",
"sha256:408673ed75594933714482501fe97e055a42996087eeca7e5d06e33218d05aa8",
"sha256:4134d3f1ba5f15027ff5c04296f13328fecd46921424084516bdb1b2548e66ff",
"sha256:4ad2f835e0ad81d1689f1b7e3fbac7b01bb8777d5a985c8962bedee0cc6d43da",
"sha256:50dff9cc21826d2977ef2d2a205504034e3a4563ca6f5db739b0d1026658e004",
"sha256:510cef4a3f401c246cfd8227b300828715dd055463cdca6176c2e4036df8bd4f",
"sha256:5aed7dde98403cd91d86a1115c78d8145c83078e864c1de1064f52e6feb61b20",
"sha256:69bd1a15d7ba3694631e00df8de65a8cb031911ca11f44929c97fe05eb9b6c1d",
"sha256:6bf088c1ce160f50ea40764f825ec9b72ed9da25346216b91361eef8ad1b8f8c",
"sha256:6e8c66f70fb539301e064f6478d7453e820d8a2c631da948a23384865cd95544",
"sha256:727dd1389bc5cb9827cbd1f9d40d2c2a1a0c9b32dd2261db522d22a604a6eec9",
"sha256:74a04183e6e64930b667d321524e3c5361094bb4af9083db5c301db64cd341f3",
"sha256:75e636fd3e0fb872693f23ccb8a5ff2cd578801251f3a4f6854c6a5d437d3c04",
"sha256:7761afe0126d046974a01e030ae7529ed0ca6a196de3ec6937c11df0df1bc91c",
"sha256:7888310f6214f19ab2b6df90f3f06afa3df7ef7355fc025e78a3044737fab1f5",
"sha256:7b0554af24df2bf96618dac71ddada02420f946be943b181108cac55a7a2dcd4",
"sha256:7c7b502bc34f6e32ba022b4a209638f9e097d7a9098104ae420eb8186217ebbb",
"sha256:808add66ea764ed97d44dda1ac4f2cfec4c1867d9efb16a33d158be79f32b8a4",
"sha256:831e648102c82f152e14c1a0938689dbb22480c548c8d4b8b248b3e50967b88c",
"sha256:93689632949aff41199090eff5474f3990b6823404e45d66a5d44304e9cdc467",
"sha256:96b5e6874431df16aee0c1ba237574cb6dff1dcb173798faa6a9d8b399a05d0e",
"sha256:9a54614049a18a2d6fe156e68e188da02a046a4a93cf24f373bffd977e943421",
"sha256:a138441e95562b3c078746a22f8fca8ff1c22c014f856278bdbdd89ca36cff1b",
"sha256:a647c0d4478b995c5e54615a2e5360ccedd2f85e70ab57fbe817ca613d5e63b8",
"sha256:a9c9bc489f8ab30906d7a85afac4b4944a572a7432e00698a7239f44a44e6efb",
"sha256:ad2277b185ebce47a63f4dc6302e30f05762b688f8dc3de55dbae4651872cdf3",
"sha256:b6d5e92df2b77665e07ddb2e4dbd6d644b78e4c0d2e9272a852627cdba0d75cf",
"sha256:bc431b065722a5ad1dfb4df354fb9333b7a582a5ee39a90e6ffff688d72f27a1",
"sha256:bdd0de2d64688ecae88dd8935012c4a72681e5df632af903a1dca8c5e7aa871a",
"sha256:c79698d4cd9318d9481d89a77e2d3fcaeff5486be641e60a4b49f3d2ecca4e28",
"sha256:cb6259196a589123d755380b65127ddc60f4c64b21fc3bb46ce3a6ea663659b0",
"sha256:d5b87da55a08acb586bad5c3aa3b86505f559b84f39035b233d5bf844b0834b1",
"sha256:dcd7b9c7139dc8258d164b55696ecd16c04607f1cc33ba7af86613881ffe4ac8",
"sha256:dfe4c1fedfde4e2fbc009d5ad420647f7730d719786388b7de0999bf32c0d9fd",
"sha256:ea98f633d45f7e815db648fd7ff0f19e328302ac36427343e4432c84432e7ff4",
"sha256:ec52c351b35ca269cb1f8069d610fc45c5bd38c3e91f9ab4cbbf0aebc136d9c8",
"sha256:eef7592281f7c174d3d6cbfbb7ee5984a671fcd77e3fc78e973d492e9bf0eb3f",
"sha256:f07f1f00e22b231dd3d9b9208692042e29792d6bd4f6639415d2f23158a80013",
"sha256:f3fac744f9b540148fa7715a435d2283b71f68bfb6d4aae24482a890aed18b59",
"sha256:fa768eff5f9f958270b081bb33581b4b569faabf8774726b283edb06617101dc",
"sha256:fac2d65901fb0fdf20363fbd345c01958a742f2dc62a8dd4495af66e3ff502a4"
],
"index": "pypi",
"version": "==8.3.1"
"version": "==9.2.0"
},
"platformdirs": {
"hashes": [
"sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788",
"sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"
],
"markers": "python_version >= '3.7'",
"version": "==2.5.2"
},
"pycodestyle": {
"hashes": [
"sha256:289cdc0969d589d90752582bef6dff57c5fbc6949ee8b013ad6d6449a8ae9437",
"sha256:beaba44501f89d785be791c9462553f06958a221d166c64e1f107320f839acc2"
],
"markers": "python_version >= '3.6'",
"version": "==2.9.0"
},
"pycparser": {
"hashes": [
"sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
"sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
"sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9",
"sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.20"
"version": "==2.21"
},
"pyflakes": {
"hashes": [
"sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2",
"sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"
],
"markers": "python_version >= '3.6'",
"version": "==2.5.0"
},
"pypdfium": {
"hashes": [
@ -224,48 +412,59 @@
},
"requests": {
"hashes": [
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
"sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
"sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
"sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.26.0"
"markers": "python_version >= '3.7' and python_version < '4'",
"version": "==2.28.1"
},
"requests-oauthlib": {
"hashes": [
"sha256:7f71572defaecd16372f9006f33c2ec8c077c3cfa6f5911a9a90202beb513f3d",
"sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a",
"sha256:fa6c47b933f01060936d87ae9327fead68768b69c6c9ea2109c48be30f2d4dbc"
"sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5",
"sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"
],
"version": "==1.3.0"
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.3.1"
},
"shapely": {
"hashes": [
"sha256:052eb5b9ba756808a7825e8a8020fb146ec489dd5c919e7d139014775411e688",
"sha256:1641724c1055459a7e2b8bbe47ba25bdc89554582e62aec23cb3f3ca25f9b129",
"sha256:17df66e87d0fe0193910aeaa938c99f0b04f67b430edb8adae01e7be557b141b",
"sha256:182716ffb500d114b5d1b75d7fd9d14b7d3414cef3c38c0490534cc9ce20981a",
"sha256:2df5260d0f2983309776cb41bfa85c464ec07018d88c0ecfca23d40bfadae2f1",
"sha256:35be1c5d869966569d3dfd4ec31832d7c780e9df760e1fe52131105685941891",
"sha256:46da0ea527da9cf9503e66c18bab6981c5556859e518fe71578b47126e54ca93",
"sha256:4c10f317e379cc404f8fc510cd9982d5d3e7ba13a9cfd39aa251d894c6366798",
"sha256:4f3c59f6dbf86a9fc293546de492f5e07344e045f9333f3a753f2dda903c45d1",
"sha256:60e5b2282619249dbe8dc5266d781cc7d7fb1b27fa49f8241f2167672ad26719",
"sha256:617bf046a6861d7c6b44d2d9cb9e2311548638e684c2cd071d8945f24a926263",
"sha256:6593026cd3f5daaea12bcc51ae5c979318070fefee210e7990cb8ac2364e79a1",
"sha256:6871acba8fbe744efa4f9f34e726d070bfbf9bffb356a8f6d64557846324232b",
"sha256:791477edb422692e7dc351c5ed6530eb0e949a31b45569946619a0d9cd5f53cb",
"sha256:8e7659dd994792a0aad8fb80439f59055a21163e236faf2f9823beb63a380e19",
"sha256:8f15b6ce67dcc05b61f19c689b60f3fe58550ba994290ff8332f711f5aaa9840",
"sha256:90a3e2ae0d6d7d50ff2370ba168fbd416a53e7d8448410758c5d6a5920646c1d",
"sha256:a3774516c8a83abfd1ddffb8b6ec1b0935d7fe6ea0ff5c31a18bfdae567b4eba",
"sha256:a5c3a50d823c192f32615a2a6920e8c046b09e07a58eba220407335a9cd2e8ea",
"sha256:b40cc7bb089ae4aa9ddba1db900b4cd1bce3925d2a4b5837b639e49de054784f",
"sha256:da38ed3d65b8091447dc3717e5218cc336d20303b77b0634b261bc5c1aa2bae8",
"sha256:de618e67b64a51a0768d26a9963ecd7d338a2cf6e9e7582d2385f88ad005b3d1",
"sha256:e3afccf0437edc108eef1e2bb9cc4c7073e7705924eb4cd0bf7715cd1ef0ce1b"
"sha256:0c0fd457ce477b1dced507a72f1e2084c9191bfcb8a1e09886990ebd02acf024",
"sha256:137f1369630408024a62ff79a437a5657e6c5b76b9cd352dde704b425acdb298",
"sha256:15a856fbb588ad5d042784e00918c662902776452008c771ecba2ff615cd197a",
"sha256:1d95842cc6bbbeab673061b63e70b07be9a375c15a60f4098f8fbd29f43af1b4",
"sha256:256bdf8080bb7bb504d47b2c76919ecebab9708cc1b26266b3ec32b42448f642",
"sha256:2e02da2e988e74d61f15c720f9f613fab51942aae2dfeacdcb78eadece00e1f3",
"sha256:3423299254deec075e79fb7dc7909d702104e4167149de7f45510c3a6342eeea",
"sha256:3a40bf497b57a6625b83996aed10ce2233bca0e5471b8af771b186d681433ac5",
"sha256:44d2832c1b706bf43101fda92831a083467cc4b4923a7ed17319ab599c1025d8",
"sha256:5254240eefc44139ab0d128faf671635d8bdd9c23955ee063d4d6b8f20073ae0",
"sha256:56413f7d32c70b63f239eb0865b24c0c61029e38757de456cc4ab3c416559a0b",
"sha256:572af9d5006fd5e3213e37ee548912b0341fb26724d6dc8a4e3950c10197ebb6",
"sha256:62056e64b12b6d483d79f8e34bf058d2fe734d51c9227c1713705399434eff3b",
"sha256:68c8e18dc9dc8a198c3addc8c9596f64137101f566f04b96ecfca0b214cb8b12",
"sha256:6bdc7728f1e5df430d8c588661f79f1eed4a2728c8b689e12707cfec217f68f8",
"sha256:6fcb28836ae93809de1dde73c03c9c24bab0ba2b2bf419ddb2aeb72c96d110e9",
"sha256:75042e8039c79dd01f102bb288beace9dc2f49fc44a2dea875f9b697aa8cd30d",
"sha256:78966332a89813b237de357a03f612fd451a871fe6e26c12b6b71645fe8eee39",
"sha256:7c8eda45085ccdd7f9805ea4a93fdd5eb0b6039a61d5f0cefb960487e6dc17a1",
"sha256:7c9e3400b716c51ba43eea1678c28272580114e009b6c78cdd00c44df3e325fa",
"sha256:840be3f27a1152851c54b968f2e12d718c9f13b7acd51c482e58a70f60f29e31",
"sha256:8e3ed52a081da58eb4a885c157c594876633dbd4eb283f13ba5bf39c82322d76",
"sha256:8fe641f1f61b3d43dd61b5a85d2ef023e6e19bf8f204a5160a1cb1ec645cbc09",
"sha256:a58e1f362f2091743e5e13212f5d5d16251a4bb63dd0ed587c652d3be9620d3a",
"sha256:a60861b5ca2c488ebcdc706eca94d325c26d1567921c74acc83df5e6913590c7",
"sha256:beee3949ddf381735049cfa6532fb234d5d20a5be910c4f2fb7c7295fd7960e3",
"sha256:c0a0d7752b145343838bd36ed09382d85f5befe426832d7384c5b051c147acbd",
"sha256:c60f3758212ec480675b820b13035dda8af8f7cc560d2cc67999b2717fb8faef",
"sha256:ce0b5c5f7acbccf98b3460eecaa40e9b18272b2a734f74fcddf1d7696e047e95",
"sha256:cec89a5617c0137f4678282e983c3d63bf838fb00cdf318cc555b4d8409f7130",
"sha256:d3f3fac625690f01f35af665649e993f15f924e740b5c0ac0376900655815521",
"sha256:d74de394684d66e25e780b0359fda85be7766af85940fa2dfad728b1a815c71f",
"sha256:e07b0bd2a0e61a8afd4d1c1bd23f3550b711f01274ffb53de99358fd781eefd8",
"sha256:f12695662c3ad1e6031b3de98f191963d0f09de6d1a4988acd907405644032ba"
],
"index": "pypi",
"version": "==1.7.1"
"version": "==1.8.2"
},
"six": {
"hashes": [
@ -275,39 +474,54 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0"
},
"tomli": {
"hashes": [
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
],
"markers": "python_full_version < '3.11.0a7'",
"version": "==2.0.1"
},
"typing-extensions": {
"hashes": [
"sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02",
"sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6"
],
"markers": "python_version >= '3.7'",
"version": "==4.3.0"
},
"urllib3": {
"hashes": [
"sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
"sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
"sha256:c33ccba33c819596124764c23a97d25f32b28433ba0dedeb77d873a38722c9bc",
"sha256:ea6e8fb210b19d950fab93b60c9009226c63a28808bc8386e05301e25883ac0a"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.6"
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4'",
"version": "==1.26.11"
}
},
"develop": {
"atomicwrites": {
"hashes": [
"sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197",
"sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"
"sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"
],
"markers": "sys_platform == 'win32'",
"version": "==1.4.0"
"version": "==1.4.1"
},
"attrs": {
"hashes": [
"sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
"sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
"sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6",
"sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.2.0"
"markers": "python_version >= '3.5'",
"version": "==22.1.0"
},
"colorama": {
"hashes": [
"sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b",
"sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"
"sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da",
"sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"
],
"markers": "sys_platform == 'win32'",
"version": "==0.4.4"
"markers": "platform_system == 'Windows'",
"version": "==0.4.5"
},
"iniconfig": {
"hashes": [
@ -318,51 +532,51 @@
},
"packaging": {
"hashes": [
"sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7",
"sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
"sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"
],
"markers": "python_version >= '3.6'",
"version": "==21.0"
"version": "==21.3"
},
"pluggy": {
"hashes": [
"sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
"sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.13.1"
"markers": "python_version >= '3.6'",
"version": "==1.0.0"
},
"py": {
"hashes": [
"sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
"sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
"sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
"sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.10.0"
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.11.0"
},
"pyparsing": {
"hashes": [
"sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
"sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
"sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb",
"sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.4.7"
"markers": "python_full_version >= '3.6.8'",
"version": "==3.0.9"
},
"pytest": {
"hashes": [
"sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b",
"sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"
"sha256:13d0e3ccfc2b6e26be000cb6568c832ba67ba32e719443bfe725814d3c42433c",
"sha256:a06a0425453864a270bc45e71f783330a7428defb4230fb5e6a731fde06ecd45"
],
"index": "pypi",
"version": "==6.2.4"
"version": "==7.1.2"
},
"toml": {
"tomli": {
"hashes": [
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.2"
"markers": "python_full_version < '3.11.0a7'",
"version": "==2.0.1"
}
}
}

Просмотреть файл

@ -10,11 +10,11 @@ The OCR.json and labels.json will also be redacted while keeping the semantics o
![ocr-before-after-redaction](./images/ocr-before-after-redaction.png)
![labels-before-after-redaction](./images/labels-before-after-redaction.png)
## Language support
## Language Support
This tool supports Latin characters redaction only. For any non-Latin document support, please [contact us](mailto:formrecog_contact@microsoft.com?subject=Redaction%20tool%20language%20support).
## Version
Redact CLI 0.2.3
Redact CLI 0.3.2
## Setup Environment
@ -103,7 +103,21 @@ python redact.py image <image_path> <fott_label_path> <output_path>
### Redact OCR Result
``` bash
python redact.py ocr <ocr_result_path> <fott_label_path> <output_path>
python redact.py ocr <ocr_result_path> <fott_label_path> <output_path> <api_version>
```
#### API Version
In Azure Form Recognizer, the OCR result has a different schema for each API version. To redact the OCR result successfully, you must pass one of the following `<api_version>` values to the redaction toolkit:
- v2.0
- v2.1
- v3.0
For example,
``` bash
python redact.py ocr sample.ocr.json sample.labels.json redacted_sample.ocr.json "v3.0"
```
### Redact FOTT Label Path
@ -113,6 +127,7 @@ python redact.py fott <fott_label_path> <output_path>
```
### Redact specific labels from Image, OCR results or FOTT Label Path
In some specific use-cases, the need may arise to redact specific labels from an image, OCR results or/and FOTT Label Path.
Labels to be redacted need to be provided together in a single string, separated by commas.
@ -127,17 +142,17 @@ And _Label_01_ and _Label_04_ need to be redacted, the following commands can be
#### Redact specific labels from Image
``` bash
python redact.py image <fott_label_path> <output_path> "Label_01,Label_04"
python redact.py image <image_path> <fott_label_path> <output_path> "Label_01,Label_04"
```
#### Redact specific labels from OCR Result
``` bash
python redact.py ocr <ocr_result_path> <image_path> <fott_label_path> <output_path> "Label_01,Label_04"
python redact.py ocr <ocr_result_path> <fott_label_path> <output_path> <api_version> "Label_01,Label_04"
```
#### Redact specific labels from FOTT Label Path
``` bash
python redact.py image <image_path> <fott_label_path> <output_path> "Label_01,Label_04"
python redact.py fott <fott_label_path> <output_path> "Label_01,Label_04"
```
### Batch Redaction
@ -146,7 +161,7 @@ Batch redaction supports redacting a folder rather than executing on a single fi
2. Azure Blob Storage virtual folder: a URL to a Blob Storage container and a folder path that denotes the folder.
``` bash
python batch_redact.py <input_container> <input_folder_path> <output_container> <output_folder_path>
python batch_redact.py <input_container> <input_folder_path> <output_container> <output_folder_path> <api_version>
```
#### Container
@ -176,12 +191,16 @@ python batch_redact.py local raw/ "https://my.blob.account/data?<my_secret_SAS_t
python batch_redact.py "https://my.blob.account/data?<my_secret_SAS_token>" folder1/ "https://my.blob.account/data?<my_secret_SAS_token>" folder2/ "v3.0"
```
#### Note
---
**NOTE**
1. Surround the URL with double quotes to prevent wrong character escape in the SAS token.
2. Visit [Create Your SAS tokens with Azure Storage Explorer](https://docs.microsoft.com/en-us/azure/cognitive-services/translator/document-translation/create-sas-tokens?tabs=Containers) to see how to create a SAS token for this program to use.
3. Currently, this redact CLI only supports ASCII character redaction (Latin alphabets without the accent marks).
---
#### PDF Support
Batch mode now supports redacting data from one-page PDF documents. The tool will detect any PDF document in the input folder, convert to an image (.png) and redact the image itself placing it in the specified output folder upon completion.
@ -204,7 +223,17 @@ pytest
in the root folder.
### Note
---
**NOTE**
1. You can also take a look at the `redact/__init__.py` file. The command line interface (CLI) is just a thin wrapper on `redact_image()`, `redact_ocr_result()`, and `redact_fott_label()`. You could extend the code on top of the three functions for achieving your own goal, such as to redact a batch of data.
2. For batch redaction, we currently only support `.jpeg`, `.jpg`, `.png`, `.tif`, `.tiff`, and `.bmp` as the file extension for images. PDF files are not supported.
---
## References
- [Form Recognizer API v2.0](https://westus2.dev.cognitive.microsoft.com/docs/services/form-recognizer-api-v2/operations/AnalyzeWithCustomForm)
- [Form Recognizer API v2.1](https://westus.dev.cognitive.microsoft.com/docs/services/form-recognizer-api-v2-1/operations/AnalyzeWithCustomForm)
- [Form Recognizer API v3.0](https://westus.dev.cognitive.microsoft.com/docs/services/form-recognizer-api-2022-08-31/operations/GetAnalyzeDocumentResult)

Просмотреть файл

@ -8,15 +8,14 @@ import shutil
from typing import List
from uuid import uuid4
from redact import redact_image, redact_fott_label, redact_ocr_result
from redact import redact_fott_label, redact_ocr_result, redact_file_bundle
from redact.io.blob_reader import BlobReader
from redact.io.blob_writer import BlobWriter
from redact.io.local_reader import LocalReader
from redact.io.local_writer import LocalWriter
from redact.utils.file_name import get_redacted_file_name, valid_url
from redact.utils.pdf_renderer import PdfRenderer
from redact.types.file_bundle import FileType, FileBundle
from redact.types.pre_processing_bundle import PdfPreProcessingBundle
from redact.preprocess import preprocess_multi_page_bundle
# Strong Assumption: assume all valid URLs are Azure Blob URL.
@ -24,92 +23,82 @@ def is_blob_url(url: str) -> bool:
return valid_url(url)
def process_pdf_bundle(file_bundles: List[FileBundle], fields_to_redact: List[str]):
renderer = PdfRenderer()
for file_bundle in file_bundles:
pdf_pre_processing_bundle = PdfPreProcessingBundle.from_file_bundle(file_bundle)
redacted_image_name = get_redacted_file_name(pdf_pre_processing_bundle.rendered_file_name)
redacted_fott_name = get_redacted_file_name(file_bundle.fott_file_name)
redacted_ocr_name = get_redacted_file_name(file_bundle.ocr_file_name)
# Render PDF
renderer.render_pdf_and_save(
Path(build_pre_processing_folder, file_bundle.image_file_name),
Path(build_pre_processing_folder, pdf_pre_processing_bundle.rendered_file_name),
target_pdf_render_dpi)
# Follow the regular redaction process with taking files from slightly different source folders
redact_image(
Path(build_pre_processing_folder, pdf_pre_processing_bundle.rendered_file_name),
Path(build_pre_processing_folder, file_bundle.fott_file_name),
Path(build_output_folder, redacted_image_name),
fields_to_redact)
redact_fott_label(
Path(build_pre_processing_folder, file_bundle.fott_file_name),
Path(build_output_folder, redacted_fott_name),
fields_to_redact)
redact_ocr_result(
Path(build_pre_processing_folder, file_bundle.ocr_file_name),
Path(build_pre_processing_folder, file_bundle.fott_file_name),
Path(build_output_folder, redacted_ocr_name),
fields_to_redact)
if __name__ == '__main__':
if __name__ == "__main__":
input_container = sys.argv[1]
input_path = sys.argv[2]
output_container = sys.argv[3]
output_path = sys.argv[4]
api_version = sys.argv[5]
target_pdf_render_dpi = 300
fields_to_redact = []
fields_to_redact = tuple()
if len(sys.argv) >= 6:
fields_to_redact = (sys.argv[5].split(','))
if len(sys.argv) >= 7:
fields_to_redact = sys.argv[6].split(",")
# Random generated UUID in the build folder name for preventing collapse.
build_path = Path(f'build-{uuid4()}/')
build_pre_processing_folder = Path(build_path, "pre/")
build_path = Path(f"build-{uuid4()}/")
build_pre_folder = Path(build_path, "pre/")
build_input_folder = Path(build_path, "in/")
build_output_folder = Path(build_path, "out/")
Path(build_pre_processing_folder).mkdir(parents=True, exist_ok=True)
Path(build_pre_folder).mkdir(parents=True, exist_ok=True)
Path(build_input_folder).mkdir(parents=True, exist_ok=True)
Path(build_output_folder).mkdir(parents=True, exist_ok=True)
try:
file_bundle_list = None
pdf_file_bundle_list = None
multi_page_bundle_list = None
if is_blob_url(input_container):
reader = BlobReader(input_container, input_path)
pdf_file_bundle_list = reader.download_bundles(to=build_pre_processing_folder, mode=FileType.PDF_ONLY)
multi_page_bundle_list = reader.download_bundles(
to=build_pre_folder, mode=FileType.MULTI_PAGE
)
file_bundle_list = reader.download_bundles(to=build_input_folder)
else:
reader = LocalReader(input_path)
pdf_file_bundle_list = reader.copy_bundles(to=build_pre_processing_folder, mode=FileType.PDF_ONLY)
multi_page_bundle_list = reader.copy_bundles(
to=build_pre_folder, mode=FileType.MULTI_PAGE
)
file_bundle_list = reader.copy_bundles(to=build_input_folder)
per_page_bundle_list: List[FileBundle] = []
# Render and process PDF/TIFF files if any.
if multi_page_bundle_list is not None:
for fb in multi_page_bundle_list:
bundle_list = preprocess_multi_page_bundle(
fb, build_pre_folder, build_input_folder, target_pdf_render_dpi
)
per_page_bundle_list.extend(bundle_list)
# Short path: preprocess folder -> output folder.
# We still need to redact the full label file.
redact_fott_label(
Path(build_pre_folder, fb.fott_file_name),
Path(
build_output_folder, get_redacted_file_name(fb.fott_file_name)
),
fields_to_redact,
)
# We still need to redact the full ocr file.
redact_ocr_result(
Path(build_pre_folder, fb.ocr_file_name),
Path(build_pre_folder, fb.fott_file_name),
Path(build_output_folder, get_redacted_file_name(fb.ocr_file_name)),
api_version,
fields_to_redact,
)
# Process images and per page result from multi-page documents.
file_bundle_list.extend(per_page_bundle_list)
for fb in file_bundle_list:
redacted_image_name = get_redacted_file_name(fb.image_file_name)
redacted_fott_name = get_redacted_file_name(fb.fott_file_name)
redacted_ocr_name = get_redacted_file_name(fb.ocr_file_name)
redact_image(
Path(build_input_folder, fb.image_file_name),
Path(build_input_folder, fb.fott_file_name),
Path(build_output_folder, redacted_image_name),
fields_to_redact)
redact_fott_label(
Path(build_input_folder, fb.fott_file_name),
Path(build_output_folder, redacted_fott_name),
fields_to_redact)
redact_ocr_result(
Path(build_input_folder, fb.ocr_file_name),
Path(build_input_folder, fb.fott_file_name),
Path(build_output_folder, redacted_ocr_name),
fields_to_redact)
# Render and process PDF files if any
if pdf_file_bundle_list is not None:
process_pdf_bundle(pdf_file_bundle_list, fields_to_redact)
redact_file_bundle(
fb,
build_input_folder,
build_output_folder,
api_version,
fields_to_redact,
)
if is_blob_url(output_container):
writer = BlobWriter(output_container, output_path)

Просмотреть файл

@ -6,30 +6,35 @@ import sys
from redact import redact_image, redact_fott_label, redact_ocr_result
if __name__ == '__main__':
if __name__ == "__main__":
operator = sys.argv[1]
if operator == 'image':
labels_to_redact = [] if len(sys.argv) < 6 else sys.argv[5].split(',')
if operator == "image":
labels_to_redact = [] if len(sys.argv) < 6 else sys.argv[5].split(",")
redact_image(
image_path=sys.argv[2],
fott_label_path=sys.argv[3],
output_path=sys.argv[4],
labels_to_redact=labels_to_redact)
labels_to_redact=labels_to_redact,
)
elif operator == 'fott':
labels_to_redact = [] if len(sys.argv) < 5 else sys.argv[4].split(',')
redact_fott_label(fott_label_path=sys.argv[2],
output_path=sys.argv[3],
labels_to_redact=labels_to_redact)
elif operator == "fott":
labels_to_redact = [] if len(sys.argv) < 5 else sys.argv[4].split(",")
redact_fott_label(
fott_label_path=sys.argv[2],
output_path=sys.argv[3],
labels_to_redact=labels_to_redact,
)
elif operator == 'ocr':
labels_to_redact = [] if len(sys.argv) < 6 else sys.argv[5].split(',')
elif operator == "ocr":
labels_to_redact = [] if len(sys.argv) < 7 else sys.argv[6].split(",")
redact_ocr_result(
ocr_result_path=sys.argv[2],
fott_label_path=sys.argv[3],
output_path=sys.argv[4],
labels_to_redact=labels_to_redact)
api_version=sys.argv[5],
labels_to_redact=labels_to_redact,
)
else:
raise NameError()

Просмотреть файл

@ -4,7 +4,7 @@
from pathlib import Path
import json
from typing import List
from typing import List, Collection
from PIL import Image, ImageOps
from dacite import from_dict
@ -12,12 +12,21 @@ from dacite import from_dict
from redact.redaction.image_redaction import ImageRedaction
from redact.redaction.ocr_result_redaction import OcrResultRedaction
from redact.redaction.fott_label_redaction import FottLabelRedaction
from redact.types.api_version import ApiVersion
from redact.types.fott_label import FottLabel
from redact.types.file_bundle import FileBundle
from redact.utils.file_name import get_redacted_file_name
def redact_image(image_path: str, fott_label_path: str, output_path: str, labels_to_redact: List[str] = []):
with Image.open(image_path) as image, \
open(fott_label_path, encoding='utf-8-sig') as fott_label_json:
def redact_image(
image_path: str,
fott_label_path: str,
output_path: str,
labels_to_redact: Collection[str] = tuple(),
):
with Image.open(image_path) as image, open(
fott_label_path, encoding="utf-8-sig"
) as fott_label_json:
# Transpose the image based on EXIF orientation tag.
image = ImageOps.exif_transpose(image)
@ -26,17 +35,22 @@ def redact_image(image_path: str, fott_label_path: str, output_path: str, labels
fott_label = from_dict(data_class=FottLabel, data=fott_label_dict)
# page_size = {page: (width, height)}
annots = fott_label.to_annotations(
page_size={1: (image.width, image.height)})
annots = fott_label.to_annotations(page_size={1: (image.width, image.height)})
redaction = ImageRedaction(image=image, annotations=annots, labels_to_redact=labels_to_redact)
redaction = ImageRedaction(
image=image, annotations=annots, labels_to_redact=labels_to_redact
)
redaction.redact()
redaction.image.save(output_path)
def redact_fott_label(fott_label_path: str, output_path: str, labels_to_redact: List[str] = []):
with open(fott_label_path, encoding='utf-8-sig') as fott_label_json:
def redact_fott_label(
fott_label_path: str,
output_path: str,
labels_to_redact: Collection[str] = tuple(),
):
with open(fott_label_path, encoding="utf-8-sig") as fott_label_json:
fott_label_dict = json.load(fott_label_json)
fott_label = from_dict(data_class=FottLabel, data=fott_label_dict)
@ -48,20 +62,24 @@ def redact_fott_label(fott_label_path: str, output_path: str, labels_to_redact:
def dumper(obj):
try:
return obj.toJSON()
except:
except AttributeError:
return obj.__dict__
Path(output_path).write_text(
json.dumps(redaction.fott_label, default=dumper), encoding='utf-8')
json.dumps(redaction.fott_label, default=dumper), encoding="utf-8"
)
def redact_ocr_result(
ocr_result_path: str,
fott_label_path: str,
output_path: str,
labels_to_redact: List[str] = []):
with open(ocr_result_path, encoding='utf-8-sig') as ocr_result_json, \
open(fott_label_path, encoding='utf-8-sig') as fott_label_json:
ocr_result_path: str,
fott_label_path: str,
output_path: str,
api_version: ApiVersion,
labels_to_redact: Collection[str] = tuple(),
):
with open(ocr_result_path, encoding="utf-8-sig") as ocr_result_json, open(
fott_label_path, encoding="utf-8-sig"
) as fott_label_json:
fott_label_dict = json.load(fott_label_json)
fott_label = from_dict(data_class=FottLabel, data=fott_label_dict)
@ -69,14 +87,63 @@ def redact_ocr_result(
# page_size = {page: (width, height)}
page_size = {}
for readResult in ocr_result["analyzeResult"]["readResults"]:
page_size[readResult["page"]] = (
readResult["width"], readResult["height"])
if ApiVersion(api_version) in [
ApiVersion.V2_0,
ApiVersion.V2_1,
]:
for read_result in ocr_result["analyzeResult"]["readResults"]:
page_size[read_result["page"]] = (
read_result["width"],
read_result["height"],
)
elif ApiVersion(api_version) in [
ApiVersion.V3_0,
]:
pages = ocr_result["analyzeResult"]["pages"]
for page in pages:
page_number = page["pageNumber"]
page_size[page_number] = (page["width"], page["height"])
annots = fott_label.to_annotations(page_size=page_size)
redaction = OcrResultRedaction(ocr_result, annots, labels_to_redact)
redaction = OcrResultRedaction(
ocr_result,
annots,
api_version,
labels_to_redact,
)
redaction.redact()
Path(output_path).write_text(
json.dumps(redaction.ocr_result), encoding='utf-8')
Path(output_path).write_text(json.dumps(redaction.ocr_result), encoding="utf-8")
def redact_file_bundle(
    fb: FileBundle,
    in_folder: str,
    out_folder: str,
    api_version: ApiVersion,
    labels_to_redact: Collection[str] = tuple(),
):
    """Redact the image, FOTT label file and OCR result of one file bundle.

    All three inputs are read from ``in_folder``; each redacted output is
    written to ``out_folder`` under the name returned by
    ``get_redacted_file_name`` for the corresponding input file.

    Args:
        fb: Bundle naming the image, FOTT label and OCR result files.
        in_folder: Folder that holds the three input files.
        out_folder: Folder that receives the three redacted files.
        api_version: OCR result schema version, forwarded to
            ``redact_ocr_result``.
        labels_to_redact: Labels forwarded unchanged to each redaction step.
    """
    # Resolve every output name before any redaction work starts.
    image_out_name, fott_out_name, ocr_out_name = [
        get_redacted_file_name(source_name)
        for source_name in (
            fb.image_file_name,
            fb.fott_file_name,
            fb.ocr_file_name,
        )
    ]

    # 1. Image, driven by the bundle's label file.
    redact_image(
        Path(in_folder, fb.image_file_name),
        Path(in_folder, fb.fott_file_name),
        Path(out_folder, image_out_name),
        labels_to_redact=labels_to_redact,
    )

    # 2. The label file itself.
    redact_fott_label(
        Path(in_folder, fb.fott_file_name),
        Path(out_folder, fott_out_name),
        labels_to_redact,
    )

    # 3. OCR result, which additionally needs the API version.
    redact_ocr_result(
        Path(in_folder, fb.ocr_file_name),
        Path(in_folder, fb.fott_file_name),
        Path(out_folder, ocr_out_name),
        api_version,
        labels_to_redact,
    )

Просмотреть файл

@ -11,13 +11,14 @@ from redact.types.file_bundle import FileBundle
from redact.types.file_bundle import FileType
class BlobReader():
class BlobReader:
def __init__(self, container_url: str, prefix: str):
self.container_client = ContainerClient.from_container_url(
container_url)
self.container_client = ContainerClient.from_container_url(container_url)
self.prefix = prefix
def download_bundles(self, to: str, mode=FileType.IMAGE_ONLY) -> List[FileBundle]:
def download_bundles(
self, to: str, mode=FileType.SINGLE_PAGE_IMAGE
) -> List[FileBundle]:
blobs = self.container_client.list_blobs(name_starts_with=self.prefix)
all_file_name_list = [Path(blob.name).name for blob in blobs]
file_bundles = FileBundle.from_names(all_file_name_list, mode)
@ -31,18 +32,18 @@ class BlobReader():
fott_path = Path(to, bundle.fott_file_name)
ocr_path = Path(to, bundle.ocr_file_name)
with open(image_path, 'wb') as image_file, \
open(fott_path, 'wb') as fott_file, \
open(ocr_path, 'wb') as ocr_file:
with open(image_path, "wb") as image_file, open(
fott_path, "wb"
) as fott_file, open(ocr_path, "wb") as ocr_file:
image_file.write(
self.container_client.
download_blob(image_blob_path).readall())
self.container_client.download_blob(image_blob_path).readall()
)
fott_file.write(
self.container_client.
download_blob(fott_blob_path).readall())
self.container_client.download_blob(fott_blob_path).readall()
)
ocr_file.write(
self.container_client.
download_blob(ocr_blob_path).readall())
self.container_client.download_blob(ocr_blob_path).readall()
)
return file_bundles

Просмотреть файл

@ -7,16 +7,14 @@ from pathlib import Path
from azure.storage.blob import ContainerClient
class BlobWriter():
class BlobWriter:
def __init__(self, container_url: str, prefix: str):
self.container_client = ContainerClient.from_container_url(
container_url)
self.container_client = ContainerClient.from_container_url(container_url)
self.prefix = prefix
def upload_files(self, folder: str):
for child in Path(folder).iterdir():
with open(child, "rb") as data:
self.container_client.upload_blob(
name=self.prefix + child.name,
data=data,
overwrite=True)
name=self.prefix + child.name, data=data, overwrite=True
)

Просмотреть файл

@ -10,12 +10,14 @@ from redact.types.file_bundle import FileBundle
from redact.types.file_bundle import FileType
class LocalReader():
class LocalReader:
def __init__(self, input_path: str):
self.input_path = Path(input_path)
def copy_bundles(self, to: str, mode=FileType.IMAGE_ONLY) -> List[FileBundle]:
file_names = [path.name for path in self.input_path.glob('**/*')]
def copy_bundles(
self, to: str, mode=FileType.SINGLE_PAGE_IMAGE
) -> List[FileBundle]:
file_names = [path.name for path in self.input_path.glob("**/*")]
file_bundles = FileBundle.from_names(file_names, mode)
for bundle in file_bundles:

Просмотреть файл

@ -6,7 +6,7 @@ from pathlib import Path
import shutil
class LocalWriter():
class LocalWriter:
def __init__(self, output_path: str):
self.output_path = Path(output_path)
Path(self.output_path).mkdir(parents=True, exist_ok=True)

Просмотреть файл

@ -0,0 +1,86 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from pathlib import Path
from typing import List
from redact.types.file_bundle import FileBundle
from redact.utils.file_name import get_page_file_name, is_pdf, is_tiff
from redact.preprocess.pdf_renderer import PdfRenderer
from redact.preprocess.tiff_renderer import TiffRenderer
from redact.preprocess.multi_page import extract_page_label, extract_page_ocr
def preprocess_multi_page_bundle(
    fb: FileBundle,
    pre_folder: str,
    in_folder: str,
    target_pdf_render_dpi: int = 300,
) -> List[FileBundle]:
    """Split a multi-page PDF/TIFF bundle into one bundle per page.

    For every page of the source document this renders the page to a PNG and
    writes the matching per-page FOTT label and OCR JSON files next to it.

    :param fb: bundle naming the multi-page image, its label file and its
        OCR file; all three are read from ``pre_folder``.
    :param pre_folder: folder that holds the original multi-page files.
    :param in_folder: folder that receives the per-page files.
    :param target_pdf_render_dpi: DPI handed to the PDF renderer (not used
        for TIFF input).
    :returns: one ``FileBundle`` per page, in page order.
    :raises ValueError: if the image is neither a PDF nor a TIFF.
    """
    source_name = fb.image_file_name
    source_path = Path(pre_folder, source_name)

    # Decide the input kind once; it cannot change between pages.
    from_pdf = bool(is_pdf(source_name))
    if from_pdf:
        renderer = PdfRenderer()
    elif is_tiff(source_name):
        renderer = TiffRenderer()
    else:
        raise ValueError("File should be PDF or TIFF.")

    bundles = []
    total_pages = renderer.get_page_count(source_path)
    for page in range(1, total_pages + 1):
        # All per-page names share the "<source>.<NNN>.rendered.png" stem.
        image_name = get_page_file_name(source_name, page, ".rendered.png")
        label_name = get_page_file_name(
            source_name, page, ".rendered.png.labels.json"
        )
        ocr_name = get_page_file_name(source_name, page, ".rendered.png.ocr.json")

        # Render raw image per page.
        image_target = Path(in_folder, image_name)
        if from_pdf:
            renderer.render_pdf_and_save(
                source_path,
                image_target,
                target_pdf_render_dpi,
                page_number=page,
            )
        else:
            renderer.render_tiff_and_save(
                source_path,
                image_target,
                page_number=page,
            )

        # Extract raw FOTT file per page.
        extract_page_label(
            Path(pre_folder, fb.fott_file_name),
            Path(in_folder, label_name),
            page,
        )

        # Extract raw OCR file per page.
        extract_page_ocr(
            Path(pre_folder, fb.ocr_file_name),
            Path(in_folder, ocr_name),
            page,
        )

        bundles.append(
            FileBundle(
                image_file_name=image_name,
                fott_file_name=label_name,
                ocr_file_name=ocr_name,
            )
        )
    return bundles

Просмотреть файл

@ -0,0 +1,48 @@
from pathlib import Path
import json
from dacite import from_dict
from redact.types.fott_label import FottLabel
def extract_page_label(fott_label_path: str, output_path: str, page_number: int):
    """Write a single-page FOTT label file extracted from a multi-page one.

    Only the entities located on ``page_number`` are kept, and they are
    renumbered to page 1 so the output describes a stand-alone one-page
    document. Labels left without any entity are dropped.

    :param fott_label_path: path of the multi-page FOTT label JSON.
    :param output_path: path the single-page label JSON is written to.
    :param page_number: 1-based page whose entities are kept.
    """
    with open(fott_label_path, encoding="utf-8-sig") as fott_label_json:
        fott_label_dict = json.load(fott_label_json)
    fott_label = from_dict(data_class=FottLabel, data=fott_label_dict)

    selected_labels = []
    for label in fott_label.labels:
        selected_entities = [
            entity for entity in label.value if entity.page == page_number
        ]
        if not selected_entities:
            continue
        for entity in selected_entities:
            entity.page = 1
        # Keep only this page's entities. Leaving the others in place would let
        # an entity that was already on page 1 collide with the renumbered ones.
        # NOTE(review): assumes Label is a mutable dataclass - confirm.
        label.value = selected_entities
        selected_labels.append(label)
    fott_label.labels = selected_labels

    # Custom dumper because default JSON serializer
    # does not support FottLabel.
    def dumper(obj):
        try:
            return obj.toJSON()
        except AttributeError:
            return obj.__dict__

    Path(output_path).write_text(
        json.dumps(fott_label, default=dumper), encoding="utf-8"
    )
def extract_page_ocr(ocr_result_path: str, output_path: str, page_number: int):
    """Write an OCR result that contains only one page of a multi-page result.

    The entry of ``analyzeResult.readResults`` at position ``page_number - 1``
    becomes the only read result and its ``page`` field is set to 1; every
    other part of the document is written back unchanged.

    :param ocr_result_path: path of the multi-page OCR result JSON.
    :param output_path: path the single-page OCR result JSON is written to.
    :param page_number: 1-based page to keep.
    """
    document = json.loads(Path(ocr_result_path).read_text(encoding="utf-8-sig"))

    analyze_result = document["analyzeResult"]
    page_result = analyze_result["readResults"][page_number - 1]
    # The extracted page is the first (and only) page of the new document.
    page_result["page"] = 1
    analyze_result["readResults"] = [page_result]

    with open(output_path, "w", encoding="utf-8") as output_json:
        output_json.write(json.dumps(document))

Просмотреть файл

@ -0,0 +1,90 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from PIL import Image
import ctypes
import pypdfium as pdfium
# Fill colour used as the page background before rendering.
WHITE = 0xFFFFFFFF


class PdfRenderer:
    """Render pages of a PDF document into PIL images through PDFium."""

    def __init__(self):
        # Initiate PDFium - This only needs to happen once
        pdfium.FPDF_InitLibraryWithConfig(pdfium.FPDF_LIBRARY_CONFIG(2, None, None, 0))

    def get_page_count(self, input_file: str):
        """
        Return the number of pages PDFium reports for a PDF.

        :param input_file: a path pointing to the PDF.
        :returns: the page count of the document.
        """
        doc = pdfium.FPDF_LoadDocument(str(input_file), None)
        try:
            return pdfium.FPDF_GetPageCount(doc)
        finally:
            # Release the document even if the count query fails.
            pdfium.FPDF_CloseDocument(doc)

    def render_pdf(
        self, input_file: str, render_target_dpi: int, page_number: int = 1
    ) -> Image.Image:
        """
        Render one PDF page into an Image.

        :param input_file: a path pointing to the PDF.
        :param render_target_dpi: the target DPI for rendering the image.
        :param page_number: a **1-based** index of the page to render.
        :returns: an Image of the PDF page at the target DPI.
        :raises ValueError: if the document or the page cannot be loaded.
        """
        doc = pdfium.FPDF_LoadDocument(str(input_file), None)
        if not doc:
            raise ValueError(f"Unable to load PDF document: {input_file}")

        page = None
        bitmap = None
        try:
            page = pdfium.FPDF_LoadPage(doc, page_number - 1)
            if not page:
                raise ValueError(
                    f"Unable to load page {page_number} of PDF document: {input_file}"
                )

            # Page dimensions are measured in points. One point is 1/72 inch (around 0.3528 mm).
            width = int(pdfium.FPDF_GetPageWidthF(page) + 0.5)
            height = int(pdfium.FPDF_GetPageHeightF(page) + 0.5)

            # Convert the page size from points to pixels at the target DPI.
            render_width = int(width / 72 * render_target_dpi)
            render_height = int(height / 72 * render_target_dpi)

            # Render to a bitmap pre-filled with the background colour.
            bitmap = pdfium.FPDFBitmap_Create(render_width, render_height, 0)
            pdfium.FPDFBitmap_FillRect(bitmap, 0, 0, render_width, render_height, WHITE)
            pdfium.FPDF_RenderPageBitmap(
                bitmap,
                page,
                0,
                0,
                render_width,
                render_height,
                0,
                pdfium.FPDF_LCD_TEXT | pdfium.FPDF_ANNOT,
            )

            # Retrieve data from the bitmap (4 bytes per pixel).
            buffer = pdfium.FPDFBitmap_GetBuffer(bitmap)
            buffer_ = ctypes.cast(
                buffer,
                ctypes.POINTER(ctypes.c_ubyte * (render_width * render_height * 4)),
            )
            # NOTE(review): the bitmap is destroyed before the image is used, so
            # the image must own its pixels by then - presumably the BGRA -> RGBA
            # decode copies the buffer; confirm against Pillow.
            return Image.frombuffer(
                "RGBA",
                (render_width, render_height),
                buffer_.contents,
                "raw",
                "BGRA",
                0,
                1,
            )
        finally:
            # Release native handles on success and on failure alike.
            if bitmap:
                pdfium.FPDFBitmap_Destroy(bitmap)
            if page:
                pdfium.FPDF_ClosePage(page)
            pdfium.FPDF_CloseDocument(doc)

    def render_pdf_and_save(
        self,
        input_file: str,
        output_file: str,
        render_target_dpi: int,
        page_number: int = 1,
    ):
        """
        Render one PDF page and save it as an image file.

        :param input_file: a path pointing to the PDF.
        :param output_file: the path the rendered image is saved to.
        :param render_target_dpi: the target DPI for rendering the image.
        :param page_number: a **1-based** index of the page to render.
        """
        img = self.render_pdf(input_file, render_target_dpi, page_number)
        try:
            img.save(output_file)
        finally:
            img.close()

Просмотреть файл

@ -0,0 +1,23 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from PIL import Image
class TiffRenderer:
    """Read page counts and single pages out of a (multi-frame) TIFF file."""

    def get_page_count(self, input_file: str):
        """
        Return the number of frames (pages) in a TIFF file.

        :param input_file: a path pointing to the TIFF.
        :returns: the frame count reported by the opened image.
        """
        # The context manager closes the file handle, which the previous
        # implementation left open.
        with Image.open(input_file) as tiffstack:
            tiffstack.load()
            return tiffstack.n_frames

    def render_tiff_and_save(
        self, input_file: str, output_file: str, page_number: int = 1
    ):
        """
        Save one page of a TIFF file as its own image file.

        :param input_file: a path pointing to the TIFF.
        :param output_file: the path the selected page is saved to.
        :param page_number: a **1-based** index of the page to save.
        """
        # Closing through the context manager also covers seek/save failures.
        with Image.open(input_file) as tiffstack:
            tiffstack.load()
            tiffstack.seek(page_number - 1)
            tiffstack.save(output_file)

Просмотреть файл

@ -1,14 +1,18 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from typing import List
from typing import Collection
from redact.types.fott_label import FottLabel
from redact.utils.redact_policy import first_char
class FottLabelRedaction:
def __init__(self, fott_label: FottLabel, labels_to_redact: List[str] = []):
def __init__(
self,
fott_label: FottLabel,
labels_to_redact: Collection[str] = tuple(),
):
self.fott_label = fott_label
self.labels_to_redact = labels_to_redact

Просмотреть файл

@ -2,7 +2,7 @@
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from typing import List
from typing import List, Collection
from PIL import Image, ImageDraw
@ -16,7 +16,12 @@ class ImageRedaction:
COLOR = "#FFFFFF"
COLOR_WITH_ALPHA = "#FFFFFFFF"
def __init__(self, image: Image, annotations: List[Annotation], labels_to_redact: List[str] = []):
def __init__(
self,
image: Image,
annotations: List[Annotation],
labels_to_redact: Collection[str] = tuple(),
):
self.image = image
self.anntations = annotations
self.labels_to_redact = labels_to_redact
@ -24,14 +29,20 @@ class ImageRedaction:
def redact(self):
draw = ImageDraw.Draw(self.image)
for annotation in self.anntations:
if len(self.labels_to_redact) == 0 or annotation.field in self.labels_to_redact:
if (
len(self.labels_to_redact) == 0
or annotation.field in self.labels_to_redact
):
if self.with_alpha_channel(self.image.mode):
draw.polygon(annotation.bounding_box,
fill=self.COLOR_WITH_ALPHA,
outline=self.COLOR_WITH_ALPHA)
draw.polygon(
annotation.bounding_box,
fill=self.COLOR_WITH_ALPHA,
outline=self.COLOR_WITH_ALPHA,
)
else:
draw.polygon(annotation.bounding_box,
fill=self.COLOR, outline=self.COLOR)
draw.polygon(
annotation.bounding_box, fill=self.COLOR, outline=self.COLOR
)
def with_alpha_channel(self, mode):
"""See https://github.com/python-pillow/Pillow/blob/affa059e959280bf7826ec1a023a64cb8f111b6d/Tests/test_image_access.py#L185
@ -54,11 +65,7 @@ class ImageRedaction:
"YCbCr",
):
return False
elif mode in (
"LA",
"PA",
"RGBA"
):
elif mode in ("LA", "PA", "RGBA"):
return True
else:
raise Exception(f"Image mode \"{mode}\" is not supported.")
raise Exception(f'Image mode "{mode}" is not supported.')

Просмотреть файл

@ -2,96 +2,44 @@
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from typing import List, Set
from jsonpointer import resolve_pointer, set_pointer
from typing import List, Collection
from redact.redaction.ocr_result_redaction_v2 import OcrResultRedactionV2
from redact.redaction.ocr_result_redaction_v3 import OcrResultRedactionV3
from redact.types.annotation import Annotation
from redact.utils.bounding_box_mapping import similar
from redact.utils.redact_policy import first_char
from redact.types.api_version import ApiVersion
class OcrResultRedaction:
LINE_OVERLAP_THRESHOLD = 0.1
WORD_OVERLAP_THRESHOLD = 0.98
def __init__(self, ocr_result: dict, annotations: List[Annotation], labels_to_redact: List[str] = []):
def __init__(
self,
ocr_result: dict,
annotations: List[Annotation],
api_version: ApiVersion = ApiVersion.V3_0,
labels_to_redact: Collection[str] = tuple(),
):
self.ocr_result = ocr_result
self.annotations = annotations
self.labels_to_redact = labels_to_redact
self.api_version = api_version
def redact(self):
refs = []
for annot in self.annotations:
if len(self.labels_to_redact) == 0 or annot.field in self.labels_to_redact:
refs.extend(self.find_mapped_refs(annot))
self.redact_words(refs)
self.redact_lines(refs)
# Set is faster than List in this case.
self.redact_page_results(set(refs))
def find_mapped_refs(self, annot: Annotation):
refs = []
read_results = self.ocr_result["analyzeResult"]["readResults"]
for read_id, read_result in enumerate(read_results):
lines: List[dict] = read_result["lines"]
for line_id, line in enumerate(lines):
# Early rejection.
if not similar(annot.bounding_box, line["boundingBox"], self.LINE_OVERLAP_THRESHOLD):
continue
words: List[dict] = line["words"]
for word_id, word in enumerate(words):
if similar(annot.bounding_box, word["boundingBox"], self.WORD_OVERLAP_THRESHOLD):
refs.append(self.build_ref(read_id, line_id, word_id))
return refs
def redact_words(self, refs: List[str]):
def word_path(ref: str) -> str:
# Remove leading '#'.
return ref[1:]
for ref in refs:
r = word_path(ref)
word = resolve_pointer(self.ocr_result, r)
word["text"] = first_char(word["text"])
set_pointer(self.ocr_result, r, word)
def redact_lines(self, refs: List[str]):
def line_path(ref: str) -> str:
end = ref.find("/word")
# Remove leading '#' and trailing word path.
return ref[1:end]
for ref in refs:
r = line_path(ref)
line = resolve_pointer(self.ocr_result, r)
tokens = line["text"].split(' ')
word_id = int(ref.split('/')[-1])
tokens[word_id] = first_char(tokens[word_id])
line["text"] = ' '.join(tokens)
set_pointer(self.ocr_result, r, line)
def redact_page_results(self, refs: Set[str]):
def add_analyze_layer(elem: str) -> str:
return elem.replace('#/', '#/analyzeResult/')
page_results = self.ocr_result["analyzeResult"]["pageResults"]
for page_result in page_results:
tables: List[dict] = page_result["tables"]
for table in tables:
cells: List[dict] = table["cells"]
for cell in cells:
elements: List[str] = cell["elements"]
for elem_id, element in enumerate(elements):
full_elem = add_analyze_layer(element)
if full_elem in refs:
tokens = cell["text"].split(' ')
tokens[elem_id] = first_char(tokens[elem_id])
cell["text"] = ' '.join(tokens)
@ staticmethod
def build_ref(read_id: int, line_id: int, word_id: int) -> str:
return f'#/analyzeResult/readResults/{read_id}/lines/{line_id}/words/{word_id}'
if ApiVersion(self.api_version) in [
ApiVersion.V2_0,
ApiVersion.V2_1,
]:
redaction = OcrResultRedactionV2(
self.ocr_result,
self.annotations,
self.labels_to_redact,
)
redaction.redact()
elif ApiVersion(self.api_version) in [
ApiVersion.V3_0,
]:
redaction = OcrResultRedactionV3(
self.ocr_result,
self.annotations,
self.labels_to_redact,
)
redaction.redact()

Просмотреть файл

@ -0,0 +1,110 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from typing import List, Set, Collection
from jsonpointer import resolve_pointer, set_pointer
from redact.types.annotation import Annotation
from redact.utils.bounding_box_mapping import similar
from redact.utils.redact_policy import first_char
class OcrResultRedactionV2:
    """Redact annotated words inside an OCR result that uses the
    ``readResults`` / ``pageResults`` layout.

    Matching words are located by bounding-box overlap and addressed with
    ``#/analyzeResult/readResults/<r>/lines/<l>/words/<w>`` references; the
    word text, the enclosing line text and any table cell that points at the
    word are then rewritten in place with ``first_char``.
    """

    LINE_OVERLAP_THRESHOLD = 0.1
    WORD_OVERLAP_THRESHOLD = 0.98

    def __init__(
        self,
        ocr_result: dict,
        annotations: List[Annotation],
        labels_to_redact: Collection[str] = tuple(),
    ):
        self.labels_to_redact = labels_to_redact
        self.annotations = annotations
        self.ocr_result = ocr_result

    def redact(self):
        """Redact ``self.ocr_result`` in place for every selected annotation.

        An empty ``labels_to_redact`` selects all annotations.
        """
        matched_refs = []
        for annotation in self.annotations:
            wanted = (
                len(self.labels_to_redact) == 0
                or annotation.field in self.labels_to_redact
            )
            if wanted:
                matched_refs += self.find_mapped_refs(annotation)

        self.redact_words(matched_refs)
        self.redact_lines(matched_refs)
        # Set is faster than List in this case.
        self.redact_page_results(set(matched_refs))

    def find_mapped_refs(self, annot: Annotation):
        """Return the references of all words whose box matches ``annot``."""
        target_box = annot.bounding_box
        found = []
        read_results = self.ocr_result["analyzeResult"]["readResults"]
        for read_id, read_result in enumerate(read_results):
            for line_id, line in enumerate(read_result["lines"]):
                # Only look at the words of lines that overlap at all.
                if similar(
                    target_box,
                    line["boundingBox"],
                    self.LINE_OVERLAP_THRESHOLD,
                ):
                    found.extend(
                        self.build_ref(read_id, line_id, word_id)
                        for word_id, word in enumerate(line["words"])
                        if similar(
                            target_box,
                            word["boundingBox"],
                            self.WORD_OVERLAP_THRESHOLD,
                        )
                    )
        return found

    def redact_words(self, refs: List[str]):
        """Rewrite the ``text`` of every referenced word."""
        for ref in refs:
            # A JSON pointer is the reference without its leading '#'.
            pointer = ref[1:]
            word = resolve_pointer(self.ocr_result, pointer)
            word["text"] = first_char(word["text"])
            set_pointer(self.ocr_result, pointer, word)

    def redact_lines(self, refs: List[str]):
        """Rewrite, inside each enclosing line's ``text``, the space-separated
        token at the referenced word's index."""
        for ref in refs:
            # Drop the leading '#' and the trailing word path.
            pointer = ref[1 : ref.find("/word")]
            line = resolve_pointer(self.ocr_result, pointer)
            pieces = line["text"].split(" ")
            word_index = int(ref.split("/")[-1])
            pieces[word_index] = first_char(pieces[word_index])
            line["text"] = " ".join(pieces)
            set_pointer(self.ocr_result, pointer, line)

    def redact_page_results(self, refs: Set[str]):
        """Rewrite the table-cell text of every cell element found in ``refs``.

        Cell elements omit the ``analyzeResult`` level, so it is inserted
        before the lookup.
        """
        for page_result in self.ocr_result["analyzeResult"]["pageResults"]:
            for table in page_result["tables"]:
                for cell in table["cells"]:
                    for position, element in enumerate(cell["elements"]):
                        if element.replace("#/", "#/analyzeResult/") not in refs:
                            continue
                        pieces = cell["text"].split(" ")
                        pieces[position] = first_char(pieces[position])
                        cell["text"] = " ".join(pieces)

    @staticmethod
    def build_ref(read_id: int, line_id: int, word_id: int) -> str:
        """Build the reference string that addresses one word."""
        return f"#/analyzeResult/readResults/{read_id}/lines/{line_id}/words/{word_id}"

Просмотреть файл

@ -0,0 +1,117 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from typing import List, Collection
from dacite import from_dict
from redact.types.annotation import Annotation
from redact.types.span import Span
from redact.utils.bounding_box_mapping import similar
from redact.utils.redact_policy import first_char
class OcrResultRedactionV3:
    """Redact annotated words inside an OCR result that uses the
    ``pages`` / ``content`` / ``tables`` layout with character spans.

    Words are matched to annotations by bounding-box overlap. Each matched
    word's span is then used to rewrite the same characters in the word, its
    line, the document ``content`` and any table cell that contains it.
    """

    WORD_OVERLAP_THRESHOLD = 0.98

    def __init__(
        self,
        ocr_result: dict,
        annotations: List[Annotation],
        labels_to_redact: Collection[str] = tuple(),
    ):
        self.ocr_result = ocr_result
        self.annotations = annotations
        self.labels_to_redact = labels_to_redact

    def redact(self):
        """Redact ``self.ocr_result`` in place."""
        words_to_redact = self.find_words_to_redact()
        self.redact_words(words_to_redact)

        spans = [
            from_dict(data_class=Span, data=word["span"]) for word in words_to_redact
        ]
        self.redact_lines(spans)
        self.redact_content(spans)
        self.redact_table(spans)

    def find_words_to_redact(self):
        """Return, per page and selected annotation, the first matching word.

        An empty ``labels_to_redact`` selects all annotations.
        """
        # The label filter does not depend on the page, so evaluate it once.
        redact_all = len(self.labels_to_redact) == 0
        selected_annotations = [
            annot
            for annot in self.annotations
            if redact_all or annot.field in self.labels_to_redact
        ]

        words_to_redact = []
        for page in self.ocr_result["analyzeResult"]["pages"]:
            for annot in selected_annotations:
                for word in page["words"]:
                    if similar(
                        annot.bounding_box,
                        word["boundingBox"],
                        self.WORD_OVERLAP_THRESHOLD,
                    ):
                        words_to_redact.append(word)
                        # Only the first match per annotation is taken.
                        break
        return words_to_redact

    def redact_words(self, words_to_redact):
        """Rewrite the ``content`` of every given word dict in place."""
        for word in words_to_redact:
            word["content"] = first_char(word["content"])

    def redact_lines(self, spans: List[Span]):
        """Rewrite the matching characters of the line containing each span."""
        pages = self.ocr_result["analyzeResult"]["pages"]
        for redact_span in spans:
            line_to_redact = self.get_line_to_redact(pages, redact_span)
            if line_to_redact is not None:
                line_spans = Span.from_dict_list(line_to_redact["spans"])
                # Line content is indexed from the line's own start.
                relative_span = redact_span.relative_to(line_spans)
                line_to_redact["content"] = self.redact_text(
                    line_to_redact["content"], relative_span
                )

    def redact_content(self, spans: List[Span]):
        """Rewrite the characters of each span in the document ``content``."""
        content = self.ocr_result["analyzeResult"]["content"]
        for span in spans:
            content = self.redact_text(content, span)
        self.ocr_result["analyzeResult"]["content"] = content

    def redact_table(self, spans: List[Span]):
        """Rewrite the matching characters of the table cell containing each span.

        A result without a ``tables`` entry is treated as having no tables.
        """
        tables = self.ocr_result["analyzeResult"].get("tables") or []
        for span in spans:
            cell_to_redact = self.get_cell_to_redact(tables, span)
            if cell_to_redact is not None:
                cell_spans = Span.from_dict_list(cell_to_redact["spans"])
                relative_span = span.relative_to(cell_spans)
                cell_to_redact["content"] = self.redact_text(
                    cell_to_redact["content"], relative_span
                )

    def get_line_to_redact(self, pages, redact_span: Span):
        """Return the first line whose spans contain ``redact_span``, or None."""
        for page in pages:
            for line in page["lines"]:
                line_spans = Span.from_dict_list(line["spans"])
                if redact_span.inside(line_spans):
                    return line
        return None

    def get_cell_to_redact(self, tables, span: Span):
        """Return the first table cell whose spans contain ``span``, or None."""
        for table in tables:
            for cell in table["cells"]:
                cell_spans = Span.from_dict_list(cell["spans"])
                if span.inside(cell_spans):
                    return cell
        return None

    @staticmethod
    def redact_text(content: str, span: Span) -> str:
        """Return ``content`` with the characters covered by ``span`` passed
        through ``first_char``; text outside the span is kept as is."""
        left = span.offset
        right = span.offset + span.length
        pre = content[:left]
        text_to_redact = content[left:right]
        post = content[right:]
        redacted_text = first_char(text_to_redact)
        return pre + redacted_text + post

Просмотреть файл

@ -9,5 +9,6 @@ from typing import List
@dataclass
class Annotation:
bounding_box: List[float]
page: int
field: str
text: str

Просмотреть файл

@ -0,0 +1,11 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from enum import Enum
class ApiVersion(Enum):
    """OCR result schema versions that can be selected for redaction.

    Each member's value is the version string spelled the way it is passed
    in, e.g. ``ApiVersion("v2.1")`` yields ``ApiVersion.V2_1``.
    """

    V2_0 = "v2.0"
    V2_1 = "v2.1"
    V3_0 = "v3.0"

Просмотреть файл

@ -7,9 +7,11 @@ import re
from typing import List, Any
from enum import Enum
class FileType(Enum):
IMAGE_ONLY = ".+(\\.jpeg|\\.jpg|\\.tif|\\.tiff|\\.png|\\.bmp)$"
PDF_ONLY = ".+(\\.pdf)$"
SINGLE_PAGE_IMAGE = ".+(\\.jpeg|\\.jpg|\\.png|\\.bmp)$"
MULTI_PAGE = ".+(\\.pdf|\\.tif|\\.tiff)$"
@dataclass
class FileBundle:
@ -31,9 +33,12 @@ class FileBundle:
ocr_file = img_file + ocr_suffix
if label_file in names and ocr_file in names:
ret.append(FileBundle(
image_file_name=img_file,
fott_file_name=label_file,
ocr_file_name=ocr_file))
ret.append(
FileBundle(
image_file_name=img_file,
fott_file_name=label_file,
ocr_file_name=ocr_file,
)
)
return ret

Просмотреть файл

@ -4,6 +4,7 @@
from dataclasses import dataclass
from typing import List, Dict, Tuple
from types import MappingProxyType
from redact.types.annotation import Annotation
@ -13,7 +14,7 @@ class Entity:
page: int
text: str
# camelCase instead of snake_case for aligning with the JSON schema.
boundingBoxes: List[List[float]]
boundingBoxes: List[List[float]] # noqa: N815
@dataclass
@ -26,7 +27,10 @@ class Label:
class FottLabel:
labels: List[Label]
def to_annotations(self, page_size: Dict[int, Tuple[float, float]] = {1: (1.0, 1.0)}) -> List[Annotation]:
def to_annotations(
self,
page_size: Dict[int, Tuple[float, float]] = MappingProxyType({1: (1.0, 1.0)}),
) -> List[Annotation]:
def to_pixel(page: int, bounding_box: List[float]) -> List[float]:
width = page_size[page][0]
height = page_size[page][1]
@ -44,7 +48,11 @@ class FottLabel:
for entity in label.value:
for bounding_box in entity.boundingBoxes:
annot = Annotation(
bounding_box=to_pixel(entity.page, bounding_box), field=label.label, text=entity.text)
bounding_box=to_pixel(entity.page, bounding_box),
field=label.label,
text=entity.text,
page=entity.page,
)
annotations.append(annot)
return annotations

Просмотреть файл

@ -0,0 +1,46 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from __future__ import annotations
from dataclasses import dataclass
from typing import List
import dacite
@dataclass
class Span:
    """A run of ``length`` characters starting at ``offset``."""

    offset: int
    length: int

    def includes(self, other: Span) -> bool:
        """Return True when ``other`` lies completely within this span."""
        if other.offset < self.offset:
            return False
        own_end = self.offset + self.length
        other_end = other.offset + other.length
        return other_end <= own_end

    def inside(self, others: List[Span]) -> bool:
        """Return True when at least one span of ``others`` includes this one."""
        for candidate in others:
            if candidate.includes(self):
                return True
        return False

    def relative_to(self, others: List[Span]) -> Span:
        """Translate this span into the text formed by concatenating ``others``.

        The returned offset is the summed length of the spans that precede
        the first including span, plus this span's distance from that span's
        start; the length is unchanged.

        :raises ValueError: if no span of ``others`` includes this one.
        """
        preceding_length = 0
        for candidate in others:
            if candidate.includes(self):
                return Span(
                    offset=preceding_length + self.offset - candidate.offset,
                    length=self.length,
                )
            preceding_length += candidate.length
        raise ValueError("Self span is not inside target span list.")

    @staticmethod
    def from_dict(data: dict) -> Span:
        """Build a ``Span`` from its dictionary form."""
        return dacite.from_dict(data_class=Span, data=data)

    @staticmethod
    def from_dict_list(data: List[dict]) -> List[Span]:
        """Build a list of ``Span`` objects from a list of dictionaries."""
        return list(map(Span.from_dict, data))

Просмотреть файл

@ -9,7 +9,11 @@ from shapely.geometry import Polygon
OVERLAP_THRESHOLD = 0.5
def similar(bounding_box_a: List[float], bounding_box_b: List[float], threshold=OVERLAP_THRESHOLD) -> bool:
def similar(
bounding_box_a: List[float],
bounding_box_b: List[float],
threshold=OVERLAP_THRESHOLD,
) -> bool:
a = Polygon(pairwise(bounding_box_a))
b = Polygon(pairwise(bounding_box_b))
base_area = min(a.area, b.area)
@ -20,6 +24,6 @@ def similar(bounding_box_a: List[float], bounding_box_b: List[float], threshold=
def pairwise(elements: List[float]) -> List[Tuple[float, float]]:
ret = []
for i in range(0, len(elements), 2):
pair = tuple([elements[i], elements[i+1]])
pair = tuple([elements[i], elements[i + 1]])
ret.append(pair)
return ret

Просмотреть файл

@ -5,21 +5,40 @@
import re
def valid_url(url: str):
def valid_url(url: str) -> bool:
# This is copied from django url validation regex.
# Source: https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
regex = re.compile(
r'^(?:http|ftp)s?://' # http:// or https://
r"^(?:http|ftp)s?://" # http:// or https://
# domain...
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
r'localhost|' # localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
r"localhost|" # localhost...
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip
r"(?::\d+)?" # optional port
r"(?:/?|[/?]\S+)$",
re.IGNORECASE,
)
return re.match(regex, url)
def get_redacted_file_name(name: str):
def get_redacted_file_name(name: str) -> str:
tokens = name.split(".")
tokens[0] = 'redacted_' + tokens[0]
return '.'.join(tokens)
tokens[0] = "redacted_" + tokens[0]
return ".".join(tokens)
def get_page_file_name(name: str, page: int, suffix: str = None) -> str:
    """Return ``name`` extended with a page marker and an optional suffix.

    The page number is zero-padded to at least three digits, so page 7 of
    ``scan.pdf`` becomes ``scan.pdf.007`` (plus ``suffix`` when given).
    """
    page_name = ".".join((name, str(page).zfill(3)))
    if suffix is not None:
        page_name = page_name + suffix
    return page_name
def is_pdf(name: str) -> bool:
    """Return True when ``name`` ends with ``.pdf`` after at least one
    other character (the match is case-sensitive)."""
    regex = re.compile(".+(\\.pdf)$")
    # Return a real bool, as annotated, rather than the Match object / None.
    return re.match(regex, name) is not None
def is_tiff(name: str) -> bool:
    """Return True when ``name`` ends with ``.tif`` or ``.tiff`` after at
    least one other character (the match is case-sensitive)."""
    regex = re.compile(".+(\\.tiff?)$")
    # Return a real bool, as annotated, rather than the Match object / None.
    return re.match(regex, name) is not None

Просмотреть файл

@ -15,10 +15,9 @@ def first_char(item: str) -> str:
# This also takes care of other common letter in Europe languages (Ø) and
# linguistic ligatures (Œ) instead of just A-Z.
ret = re.sub('[A-ZØÞŁꜲÆꜴꜶꜸꜺꜼǶŒꝎẞꜨꝠ]', 'A', ret)
ret = re.sub('[a-zøþıłꜳæꬱꜵꜷꜹꜻꜽ🙰ꭁƕỻœꝏßꜩꝡ]', 'a', ret)
ret = re.sub('[0-9]', '0', ret)
return ret
ret = re.sub("[A-ZØÞŁꜲÆꜴꜶꜸꜺꜼǶŒꝎẞꜨꝠ]", "A", ret)
ret = re.sub("[a-zøþıłꜳæꬱꜵꜷꜹꜻꜽ🙰ꭁƕỻœꝏßꜩꝡ]", "a", ret)
return re.sub("[0-9]", "0", ret)
def remove_diacritics(input_str: str) -> str:
@ -35,5 +34,5 @@ def remove_diacritics(input_str: str) -> str:
Returns:
str: The string without diacritics and typographical ligatures.
"""
nfkd_form = unicodedata.normalize('NFKD', input_str)
return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
nfkd_form = unicodedata.normalize("NFKD", input_str)
return "".join([c for c in nfkd_form if not unicodedata.combining(c)])

Двоичные данные
scripts/redact_cli_py/requirements.txt

Двоичный файл не отображается.

Двоичные данные
scripts/redact_cli_py/testdata-redacted.jpg Normal file

Двоичный файл не отображается.

После

Ширина:  |  Высота:  |  Размер: 235 KiB

Двоичные данные
scripts/redact_cli_py/testdata-redacted.tiff Normal file

Двоичный файл не отображается.

2479
scripts/redact_cli_py/testdata/customer.label.json поставляемый Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Двоичные данные
scripts/redact_cli_py/testdata/customer.tif поставляемый Normal file

Двоичный файл не отображается.

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -9,29 +9,63 @@ from redact.types.annotation import Annotation
class AnnotationFactory:
def build_annotations() -> List[Annotation]:
annotations = [
Annotation(bounding_box=[375.0, 739.0, 517.0, 738.0, 517.0,
782.0, 375.0, 781.0],
field='Name', text='Aenean'),
Annotation(bounding_box=[1265.0, 1091.0, 1495.0, 1090.0, 1494.0,
1132.0, 1267.0, 1134.0],
field='Date', text='1900/01/01'),
Annotation(bounding_box=[1260.0, 1165.0, 1445.9999999999998,
1165.0, 1445.0, 1210.0, 1261.0, 1212.0],
field='Total', text='$3000.00')]
return annotations
return [
Annotation(
bounding_box=[375.0, 739.0, 517.0, 738.0, 517.0, 782.0, 375.0, 781.0],
field="Name",
text="Aenean",
page=1,
),
Annotation(
bounding_box=[
1265.0,
1091.0,
1495.0,
1090.0,
1494.0,
1132.0,
1267.0,
1134.0,
],
field="Date",
text="1900/01/01",
page=1,
),
Annotation(
bounding_box=[
1260.0,
1165.0,
1445.9999999999998,
1165.0,
1445.0,
1210.0,
1261.0,
1212.0,
],
field="Total",
text="$3000.00",
page=1,
),
]
def build_annotations_mode_1() -> List[Annotation]:
annotations = [
Annotation(bounding_box=[76, 105, 104, 105,
104, 111, 76, 111],
field='Name', text=''),
Annotation(bounding_box=[255, 155, 301, 155,
301, 161, 255, 162],
field='Date', text=''),
Annotation(bounding_box=[254, 166, 291, 166,
291, 172, 254, 173],
field='Total', text='')]
return annotations
return [
Annotation(
bounding_box=[76, 105, 104, 105, 104, 111, 76, 111],
field="Name",
text="",
page=1,
),
Annotation(
bounding_box=[255, 155, 301, 155, 301, 161, 255, 162],
field="Date",
text="",
page=1,
),
Annotation(
bounding_box=[254, 166, 291, 166, 291, 172, 254, 173],
field="Total",
text="",
page=1,
),
]

Просмотреть файл

@ -10,39 +10,34 @@ from redact.types.fott_label import FottLabel
class FottLabelFactory:
@staticmethod
def build() -> FottLabel:
fott_label_path = "testdata/testdata.jpg.labels.json"
with open(fott_label_path, encoding='utf-8-sig') as fott_label_json:
with open(fott_label_path, encoding="utf-8-sig") as fott_label_json:
fott_label_dict = json.load(fott_label_json)
fott_label = from_dict(data_class=FottLabel, data=fott_label_dict)
return fott_label
return from_dict(data_class=FottLabel, data=fott_label_dict)
@staticmethod
def build_redacted() -> FottLabel:
fott_label_path = "testdata/testdata.redacted.labels.json"
with open(fott_label_path, encoding='utf-8-sig') as fott_label_json:
with open(fott_label_path, encoding="utf-8-sig") as fott_label_json:
fott_label_dict = json.load(fott_label_json)
fott_label = from_dict(data_class=FottLabel, data=fott_label_dict)
return fott_label
return from_dict(data_class=FottLabel, data=fott_label_dict)
@staticmethod
def build_partial() -> FottLabel:
fott_label_path = "testdata/testdata-partial.jpg.labels.json"
with open(fott_label_path, encoding='utf-8-sig') as fott_label_json:
with open(fott_label_path, encoding="utf-8-sig") as fott_label_json:
fott_label_dict = json.load(fott_label_json)
fott_label = from_dict(data_class=FottLabel, data=fott_label_dict)
return fott_label
return from_dict(data_class=FottLabel, data=fott_label_dict)
@staticmethod
def build_redacted_partial() -> FottLabel:
fott_label_path = "testdata/testdata-partial.redacted.labels.json"
with open(fott_label_path, encoding='utf-8-sig') as fott_label_json:
with open(fott_label_path, encoding="utf-8-sig") as fott_label_json:
fott_label_dict = json.load(fott_label_json)
fott_label = from_dict(data_class=FottLabel, data=fott_label_dict)
return fott_label
return from_dict(data_class=FottLabel, data=fott_label_dict)

Просмотреть файл

@ -6,7 +6,6 @@ from PIL import Image
class ImageFactory:
@staticmethod
def build() -> Image:
image_path = "testdata/testdata.jpg"

Просмотреть файл

@ -6,35 +6,44 @@ import json
class OcrResultFactory:
@staticmethod
def build() -> dict:
ocr_result_path = "testdata/testdata.jpg.ocr.json"
with open(ocr_result_path, encoding='utf-8-sig') as ocr_result_json:
ocr_result_dict = json.load(ocr_result_json)
return ocr_result_dict
with open(ocr_result_path, encoding="utf-8-sig") as ocr_result_json:
return json.load(ocr_result_json)
@staticmethod
def build_redacted() -> dict:
ocr_result_path = "testdata/testdata.redacted.ocr.json"
with open(ocr_result_path, encoding='utf-8-sig') as ocr_result_json:
ocr_result_dict = json.load(ocr_result_json)
return ocr_result_dict
with open(ocr_result_path, encoding="utf-8-sig") as ocr_result_json:
return json.load(ocr_result_json)
@staticmethod
def build_partial() -> dict:
ocr_result_path = "testdata/testdata-partial.jpg.ocr.json"
with open(ocr_result_path, encoding='utf-8-sig') as ocr_result_json:
ocr_result_dict = json.load(ocr_result_json)
return ocr_result_dict
with open(ocr_result_path, encoding="utf-8-sig") as ocr_result_json:
return json.load(ocr_result_json)
@staticmethod
def build_redacted_partial() -> dict:
ocr_result_path = "testdata/testdata-partial.redacted.ocr.json"
with open(ocr_result_path, encoding='utf-8-sig') as ocr_result_json:
ocr_result_dict = json.load(ocr_result_json)
return ocr_result_dict
with open(ocr_result_path, encoding="utf-8-sig") as ocr_result_json:
return json.load(ocr_result_json)
@staticmethod
def build_2021_09_30_preview() -> dict:
ocr_result_path = "testdata/testdata.jpg.2021-09-30-preview.ocr.json"
with open(ocr_result_path, encoding="utf-8-sig") as ocr_result_json:
return json.load(ocr_result_json)
@staticmethod
def build_redacted_2021_09_30_preview() -> dict:
ocr_result_path = "testdata/testdata.jpg.2021-09-30-preview.redacted.ocr.json"
with open(ocr_result_path, encoding="utf-8-sig") as ocr_result_json:
return json.load(ocr_result_json)

Просмотреть файл

@ -42,7 +42,7 @@ class TestImageRedaction:
expected_image = ImageFactory.build_redacted_partial()
annotations = AnnotationFactory.build_annotations()
image_redaction = ImageRedaction(image, annotations, ["Name","Date"])
image_redaction = ImageRedaction(image, annotations, ["Name", "Date"])
image_redaction.redact()
diff = ImageChops.difference(image_redaction.image, expected_image)

Просмотреть файл

@ -2,37 +2,62 @@
# Licensed under the MIT License. See License.txt in the project
# root for license information.
import pytest
from redact.redaction.ocr_result_redaction import OcrResultRedaction
from redact.redaction.ocr_result_redaction_v2 import OcrResultRedactionV2
from redact.redaction.ocr_result_redaction_v3 import OcrResultRedactionV3
from redact.types.api_version import ApiVersion
from tests.factories.ocr_result_factory import OcrResultFactory
from tests.factories.annotation_factory import AnnotationFactory
class TestOcrResultRedaction:
    """Compare ``OcrResultRedaction`` with the version-specific redactors.

    Each test runs ``OcrResultRedaction`` with an explicit ``ApiVersion`` and
    asserts that its result equals the one produced by the matching
    ``OcrResultRedactionV2`` / ``OcrResultRedactionV3`` class.
    """

    @pytest.mark.parametrize(
        "api_version",
        [
            ApiVersion.V2_0,
            ApiVersion.V2_1,
        ],
    )
    def test_redact_v2(self, api_version) -> None:
        """For each v2.x API version the result must equal ``OcrResultRedactionV2``'s."""
        annotations = AnnotationFactory.build_annotations()

        # Each redactor gets its own input object. If redact() works in place
        # (not visible from this file), sharing a single object would make the
        # final comparison trivially true.
        # NOTE(review): assumes OcrResultFactory.build() returns a new object
        # on every call - confirm against the factory.
        ocr_result_redaction = OcrResultRedaction(
            OcrResultFactory.build(),
            annotations,
            api_version,
        )
        ocr_result_redaction.redact()

        v2 = OcrResultRedactionV2(
            OcrResultFactory.build(),
            annotations,
        )
        v2.redact()

        assert ocr_result_redaction.ocr_result == v2.ocr_result

    @pytest.mark.parametrize(
        "api_version",
        [
            ApiVersion.V3_0,
        ],
    )
    def test_redact_v3(self, api_version) -> None:
        """For the v3.0 API version the result must equal ``OcrResultRedactionV3``'s."""
        annotations = AnnotationFactory.build_annotations()

        # Separate inputs for the same reason as in test_redact_v2: avoid
        # comparing one (possibly mutated) object with itself.
        ocr_result_redaction = OcrResultRedaction(
            OcrResultFactory.build_2021_09_30_preview(),
            annotations,
            api_version,
        )
        ocr_result_redaction.redact()

        v3 = OcrResultRedactionV3(
            OcrResultFactory.build_2021_09_30_preview(),
            annotations,
        )
        v3.redact()

        assert ocr_result_redaction.ocr_result == v3.ocr_result

Просмотреть файл

@ -0,0 +1,45 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from redact.redaction.ocr_result_redaction_v2 import OcrResultRedactionV2
from tests.factories.ocr_result_factory import OcrResultFactory
from tests.factories.annotation_factory import AnnotationFactory
class TestOcrResultRedactionV2:
    """Tests for ``OcrResultRedactionV2`` using factory-built inputs."""

    def test_ctor(self) -> None:
        """A freshly constructed redactor exposes the OCR result it was given."""
        source = OcrResultFactory.build()
        redaction = OcrResultRedactionV2(
            source,
            AnnotationFactory.build_annotations(),
        )

        assert redaction.ocr_result == source

    def test_redact(self) -> None:
        """Redacting the full fixture yields the ``build_redacted()`` result."""
        source = OcrResultFactory.build()
        redacted_fixture = OcrResultFactory.build_redacted()
        redaction = OcrResultRedactionV2(
            source,
            AnnotationFactory.build_annotations(),
        )

        redaction.redact()

        assert redaction.ocr_result == redacted_fixture

    def test_redact_partial(self) -> None:
        """Redacting with a label list yields the ``build_redacted_partial()`` result.

        The list ``["Name", "Date"]`` is passed as the third constructor
        argument (presumably the labels to redact - confirm against
        ``OcrResultRedactionV2``).
        """
        source = OcrResultFactory.build_partial()
        redacted_fixture = OcrResultFactory.build_redacted_partial()
        redaction = OcrResultRedactionV2(
            source,
            AnnotationFactory.build_annotations(),
            ["Name", "Date"],
        )

        redaction.redact()

        assert redaction.ocr_result == redacted_fixture

Просмотреть файл

@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from redact.redaction.ocr_result_redaction_v3 import OcrResultRedactionV3
from tests.factories.ocr_result_factory import OcrResultFactory
from tests.factories.annotation_factory import AnnotationFactory
class TestOcrResultRedactionV3:
    """Tests for ``OcrResultRedactionV3`` using the 2021-09-30-preview inputs."""

    def test_ctor(self) -> None:
        """A freshly constructed redactor exposes the OCR result it was given."""
        source = OcrResultFactory.build_2021_09_30_preview()
        redaction = OcrResultRedactionV3(
            source,
            AnnotationFactory.build_annotations(),
        )

        assert redaction.ocr_result == source

    def test_redact(self) -> None:
        """Redacting the preview fixture yields its redacted counterpart."""
        source = OcrResultFactory.build_2021_09_30_preview()
        redacted_fixture = OcrResultFactory.build_redacted_2021_09_30_preview()
        redaction = OcrResultRedactionV3(
            source,
            AnnotationFactory.build_annotations(),
        )

        redaction.redact()

        assert redaction.ocr_result == redacted_fixture

Просмотреть файл

@ -2,13 +2,13 @@
# Licensed under the MIT License. See License.txt in the project
# root for license information.
from PIL import ImageChops, ImageStat, Image
from PIL import ImageChops, ImageStat
from redact.utils.pdf_renderer import PdfRenderer
from redact.preprocess.pdf_renderer import PdfRenderer
from tests.factories.image_factory import ImageFactory
class TestPdfRendering:
class TestPdfRendering:
def test_rendering(self) -> None:
# A small tolerance epsilon because of the jpg compression loss.
epsilon = 0.1

Просмотреть файл

@ -8,17 +8,16 @@ from redact.types.file_bundle import FileType
class TestFileBundle:
def test_from_names(self) -> None:
    """``FileBundle.from_names`` should return one bundle for the ``a.jpg`` triple.

    The input also contains ``dummy_file.jpg``, which has no ``.labels.json``
    or ``.ocr.json`` companion in the list; the expected output contains no
    bundle for it.
    """
    names = ["a.jpg", "a.jpg.labels.json", "dummy_file.jpg", "a.jpg.ocr.json"]
    expected = [
        FileBundle(
            image_file_name="a.jpg",
            fott_file_name="a.jpg.labels.json",
            ocr_file_name="a.jpg.ocr.json",
        )
    ]

    actual = FileBundle.from_names(names, FileType.SINGLE_PAGE_IMAGE)

    assert actual == expected
@ -30,12 +29,16 @@ class TestFileBundle:
"a.jpg",
"a.jpg.labels.json",
"dummy_file.pdf",
"a.pdf.ocr.json"]
expected = [FileBundle(
image_file_name="a.pdf",
fott_file_name="a.pdf.labels.json",
ocr_file_name="a.pdf.ocr.json")]
"a.pdf.ocr.json",
]
expected = [
FileBundle(
image_file_name="a.pdf",
fott_file_name="a.pdf.labels.json",
ocr_file_name="a.pdf.ocr.json",
)
]
actual = FileBundle.from_names(names, FileType.PDF_ONLY)
actual = FileBundle.from_names(names, FileType.MULTI_PAGE)
assert actual == expected

Просмотреть файл

@ -24,8 +24,10 @@ class TestFottLabel:
bbox = annotations[0].bounding_box
for i, element in enumerate(bbox):
bbox[i] = element * 10
annotations[0].page = 2
actual = fott_label.to_annotations(
page_size={1: (2481, 3509), 2: (24810, 35090)})
page_size={1: (2481, 3509), 2: (24810, 35090)}
)
assert actual == annotations

Просмотреть файл

@ -11,7 +11,7 @@ class TestRedactPolicy:
actual = first_char(text)
assert "" == actual
def test_first_char_apple(self) -> None:
    """``first_char("Apple")`` is expected to return ``"Aaaaa"``."""
    text = "Apple"

    actual = first_char(text)

    assert "Aaaaa" == actual