Update the find corpus tool to provide more information (#280)

* Add pytest-clarity for better text diffs in tests

* Add requests_mock for tests

* Add the test_data artifact to the .gitignore

* Use an underscore with find_corpus.py

* Update the find corpus tool to provide more information

* Add humanize to the dependency list
This commit is contained in:
Greg Tatum 2023-12-12 15:08:59 -06:00 коммит произвёл GitHub
Родитель ae781439d0
Коммит d1be2bca4a
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
7 изменённых файлов: 589 добавлений и 90 удалений

2
.gitignore поставляемый
Просмотреть файл

@ -136,3 +136,5 @@ dmypy.json
.models
.bin
.snakemake
tests_data

Просмотреть файл

@ -164,8 +164,8 @@ fix-all:
# Run unit tests
run-tests:
poetry install --only tests
PYTHONPATH=$$(pwd) poetry run pytest tests
poetry install --only tests --only utils
PYTHONPATH=$$(pwd) poetry run pytest tests -vv
# Validates Taskcluster task graph locally
validate-taskgraph:

138
poetry.lock сгенерированный
Просмотреть файл

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
[[package]]
name = "absl-py"
@ -1021,6 +1021,20 @@ files = [
[package.dependencies]
pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""}
[[package]]
name = "humanize"
version = "4.9.0"
description = "Python humanize utilities"
optional = false
python-versions = ">=3.8"
files = [
{file = "humanize-4.9.0-py3-none-any.whl", hash = "sha256:ce284a76d5b1377fd8836733b983bfb0b76f1aa1c090de2566fcf008d7f6ab16"},
{file = "humanize-4.9.0.tar.gz", hash = "sha256:582a265c931c683a7e9b8ed9559089dea7edcf6cc95be39a3cbc2c5d5ac2bcfa"},
]
[package.extras]
tests = ["freezegun", "pytest", "pytest-cov"]
[[package]]
name = "idna"
version = "3.4"
@ -1189,6 +1203,30 @@ files = [
docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"]
testing = ["coverage", "pyyaml"]
[[package]]
name = "markdown-it-py"
version = "3.0.0"
description = "Python port of markdown-it. Markdown parsing, done right!"
optional = false
python-versions = ">=3.8"
files = [
{file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"},
{file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"},
]
[package.dependencies]
mdurl = ">=0.1,<1.0"
[package.extras]
benchmarking = ["psutil", "pytest", "pytest-benchmark"]
code-style = ["pre-commit (>=3.0,<4.0)"]
compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"]
linkify = ["linkify-it-py (>=1,<3)"]
plugins = ["mdit-py-plugins"]
profiling = ["gprof2dot"]
rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"]
testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
[[package]]
name = "markupsafe"
version = "2.1.3"
@ -1258,6 +1296,17 @@ files = [
{file = "MarkupSafe-2.1.3.tar.gz", hash = "sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad"},
]
[[package]]
name = "mdurl"
version = "0.1.2"
description = "Markdown URL utilities"
optional = false
python-versions = ">=3.7"
files = [
{file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
]
[[package]]
name = "mohawk"
version = "1.1.0"
@ -1720,6 +1769,17 @@ docs = ["sphinx (>=1.7.1)"]
redis = ["redis"]
tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-flake8 (>=1.0.5)", "pytest-mypy (>=0.8.0)", "redis", "sphinx (>=3.0.3)"]
[[package]]
name = "pprintpp"
version = "0.4.0"
description = "A drop-in replacement for pprint that's actually pretty"
optional = false
python-versions = "*"
files = [
{file = "pprintpp-0.4.0-py2.py3-none-any.whl", hash = "sha256:b6b4dcdd0c0c0d75e4d7b2f21a9e933e5b2ce62b26e1a54537f9651ae5a5c01d"},
{file = "pprintpp-0.4.0.tar.gz", hash = "sha256:ea826108e2c7f49dc6d66c752973c3fc9749142a798d6b254e1e301cfdbc6403"},
]
[[package]]
name = "prefixed"
version = "0.7.0"
@ -2008,6 +2068,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-clarity"
version = "1.0.1"
description = "A plugin providing an alternative, colourful diff output for failing assertions."
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "pytest-clarity-1.0.1.tar.gz", hash = "sha256:505fe345fad4fe11c6a4187fe683f2c7c52c077caa1e135f3e483fe112db7772"},
]
[package.dependencies]
pprintpp = ">=0.4.0"
pytest = ">=3.5.0"
rich = ">=8.0.0"
[[package]]
name = "python-dateutil"
version = "2.8.2"
@ -2068,7 +2143,6 @@ files = [
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
{file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
{file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
{file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
{file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
@ -2076,15 +2150,8 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
{file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
{file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
{file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
{file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
@ -2101,7 +2168,6 @@ files = [
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
{file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
{file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
{file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
{file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
@ -2109,7 +2175,6 @@ files = [
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
{file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
{file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
{file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
{file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
@ -2234,6 +2299,25 @@ urllib3 = ">=1.21.1,<1.27"
socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"]
use-chardet-on-py3 = ["chardet (>=3.0.2,<5)"]
[[package]]
name = "requests-mock"
version = "1.11.0"
description = "Mock out responses from the requests package"
optional = false
python-versions = "*"
files = [
{file = "requests-mock-1.11.0.tar.gz", hash = "sha256:ef10b572b489a5f28e09b708697208c4a3b2b89ef80a9f01584340ea357ec3c4"},
{file = "requests_mock-1.11.0-py2.py3-none-any.whl", hash = "sha256:f7fae383f228633f6bececebdab236c478ace2284d6292c6e7e2867b9ab74d15"},
]
[package.dependencies]
requests = ">=2.3,<3"
six = "*"
[package.extras]
fixture = ["fixtures"]
test = ["fixtures", "mock", "purl", "pytest", "requests-futures", "sphinx", "testtools"]
[[package]]
name = "requests-oauthlib"
version = "1.3.1"
@ -2252,6 +2336,24 @@ requests = ">=2.0.0"
[package.extras]
rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
[[package]]
name = "rich"
version = "13.7.0"
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
optional = false
python-versions = ">=3.7.0"
files = [
{file = "rich-13.7.0-py3-none-any.whl", hash = "sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235"},
{file = "rich-13.7.0.tar.gz", hash = "sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa"},
]
[package.dependencies]
markdown-it-py = ">=2.2.0"
pygments = ">=2.13.0,<3.0.0"
[package.extras]
jupyter = ["ipywidgets (>=7.5.1,<9)"]
[[package]]
name = "rsa"
version = "4.9"
@ -2293,24 +2395,24 @@ python-versions = ">=3.6"
files = [
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b42169467c42b692c19cf539c38d4602069d8c1505e97b86387fcf7afb766e1d"},
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:07238db9cbdf8fc1e9de2489a4f68474e70dffcb32232db7c08fa61ca0c7c462"},
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:d92f81886165cb14d7b067ef37e142256f1c6a90a65cd156b063a43da1708cfd"},
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fff3573c2db359f091e1589c3d7c5fc2f86f5bdb6f24252c2d8e539d4e45f412"},
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:aa2267c6a303eb483de8d02db2871afb5c5fc15618d894300b88958f729ad74f"},
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:840f0c7f194986a63d2c2465ca63af8ccbbc90ab1c6001b1978f05119b5e7334"},
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:024cfe1fc7c7f4e1aff4a81e718109e13409767e4f871443cbff3dba3578203d"},
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-win32.whl", hash = "sha256:c69212f63169ec1cfc9bb44723bf2917cbbd8f6191a00ef3410f5a7fe300722d"},
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-win_amd64.whl", hash = "sha256:cabddb8d8ead485e255fe80429f833172b4cadf99274db39abc080e068cbcc31"},
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:bef08cd86169d9eafb3ccb0a39edb11d8e25f3dae2b28f5c52fd997521133069"},
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:b16420e621d26fdfa949a8b4b47ade8810c56002f5389970db4ddda51dbff248"},
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:b5edda50e5e9e15e54a6a8a0070302b00c518a9d32accc2346ad6c984aacd279"},
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:25c515e350e5b739842fc3228d662413ef28f295791af5e5110b543cf0b57d9b"},
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-manylinux_2_24_aarch64.whl", hash = "sha256:1707814f0d9791df063f8c19bb51b0d1278b8e9a2353abbb676c2f685dee6afe"},
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:46d378daaac94f454b3a0e3d8d78cafd78a026b1d71443f4966c696b48a6d899"},
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:09b055c05697b38ecacb7ac50bdab2240bfca1a0c4872b0fd309bb07dc9aa3a9"},
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-win32.whl", hash = "sha256:53a300ed9cea38cf5a2a9b069058137c2ca1ce658a874b79baceb8f892f915a7"},
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-win_amd64.whl", hash = "sha256:c2a72e9109ea74e511e29032f3b670835f8a59bbdc9ce692c5b4ed91ccf1eedb"},
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ebc06178e8821efc9692ea7544aa5644217358490145629914d8020042c24aa1"},
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:edaef1c1200c4b4cb914583150dcaa3bc30e592e907c01117c08b13a07255ec2"},
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:7048c338b6c86627afb27faecf418768acb6331fc24cfa56c93e8c9780f815fa"},
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d176b57452ab5b7028ac47e7b3cf644bcfdc8cacfecf7e71759f7f51a59e5c92"},
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-manylinux_2_24_aarch64.whl", hash = "sha256:1dc67314e7e1086c9fdf2680b7b6c2be1c0d8e3a8279f2e993ca2a7545fecf62"},
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3213ece08ea033eb159ac52ae052a4899b56ecc124bb80020d9bbceeb50258e9"},
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aab7fd643f71d7946f2ee58cc88c9b7bfc97debd71dcc93e03e2d174628e7e2d"},
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-win32.whl", hash = "sha256:5c365d91c88390c8d0a8545df0b5857172824b1c604e867161e6b3d59a827eaa"},
@ -2318,7 +2420,7 @@ files = [
{file = "ruamel.yaml.clib-0.2.8-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a5aa27bad2bb83670b71683aae140a1f52b0857a2deff56ad3f6c13a017a26ed"},
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c58ecd827313af6864893e7af0a3bb85fd529f862b6adbefe14643947cfe2942"},
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-macosx_12_0_arm64.whl", hash = "sha256:f481f16baec5290e45aebdc2a5168ebc6d35189ae6fea7a58787613a25f6e875"},
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:3fcc54cb0c8b811ff66082de1680b4b14cf8a81dce0d4fbf665c2265a81e07a1"},
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:77159f5d5b5c14f7c34073862a6b7d34944075d9f93e681638f6d753606c6ce6"},
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7f67a1ee819dc4562d444bbafb135832b0b909f81cc90f7aa00260968c9ca1b3"},
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4ecbf9c3e19f9562c7fdd462e8d18dd902a47ca046a2e64dba80699f0b6c09b7"},
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:87ea5ff66d8064301a154b3933ae406b0863402a799b16e4a1d24d9fbbcbe0d3"},
@ -2326,7 +2428,7 @@ files = [
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-win_amd64.whl", hash = "sha256:3f215c5daf6a9d7bbed4a0a4f760f3113b10e82ff4c5c44bec20a68c8014f675"},
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1b617618914cb00bf5c34d4357c37aa15183fa229b24767259657746c9077615"},
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a6a9ffd280b71ad062eae53ac1659ad86a17f59a0fdc7699fd9be40525153337"},
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:665f58bfd29b167039f714c6998178d27ccd83984084c286110ef26b230f259f"},
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:305889baa4043a09e5b76f8e2a51d4ffba44259f6b4c72dec8ca56207d9c6fe1"},
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:700e4ebb569e59e16a976857c8798aee258dceac7c7d6b50cab63e080058df91"},
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e2b4c44b60eadec492926a7270abb100ef9f72798e18743939bdbf037aab8c28"},
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e79e5db08739731b0ce4850bed599235d601701d5694c36570a99a0c5ca41a9d"},
@ -2334,7 +2436,7 @@ files = [
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-win_amd64.whl", hash = "sha256:56f4252222c067b4ce51ae12cbac231bce32aee1d33fbfc9d17e5b8d6966c312"},
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:03d1162b6d1df1caa3a4bd27aa51ce17c9afc2046c31b0ad60a0a96ec22f8001"},
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:bba64af9fa9cebe325a62fa398760f5c7206b215201b0ec825005f1b18b9bccf"},
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:9eb5dee2772b0f704ca2e45b1713e4e5198c18f515b52743576d196348f374d3"},
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:a1a45e0bb052edf6a1d3a93baef85319733a888363938e1fc9924cb00c8df24c"},
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:da09ad1c359a728e112d60116f626cc9f29730ff3e0e7db72b9a2dbc2e4beed5"},
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:184565012b60405d93838167f425713180b949e9d8dd0bbc7b49f074407c5a8b"},
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a75879bacf2c987c003368cf14bed0ffe99e8e85acfa6c0bfffc21a090f16880"},
@ -2995,4 +3097,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "3e4e51ed7309819903d851969d04f5c12c43f63aafa46b8a5983512a40d20bf7"
content-hash = "75996c59b50c2ad361524908ad9e6cd88a21b782e6d99adaa39fa4bf57ed70b4"

Просмотреть файл

@ -29,6 +29,7 @@ marian-tensorboard = "^0.2.1"
sacrebleu="2.0.0"
mtdata="0.3.2"
requests="2.26.0"
humanize = "^4.9.0"
[tool.poetry.group.tests.dependencies]
sacrebleu="2.0.0"
@ -37,6 +38,8 @@ requests="2.26.0"
pytest="7.4.3"
# use the latest main, switch to PyPi when released
opustrainer = {git = "https://github.com/hplt-project/OpusTrainer.git", rev="9133e1525c7ee37f53ea14ee6a180152bf7ea192"}
pytest-clarity = "^1.0.1"
requests-mock = "^1.11.0"
[tool.black]
extend-exclude= "/3rd_party"

154
tests/test_find_corpus.py Normal file
Просмотреть файл

@ -0,0 +1,154 @@
from textwrap import dedent
import pytest
from utils.find_corpus import main as find_corpus
"""
Tests the `utils/find_corpus.py` script.
"""
@pytest.fixture
def mock_opus_data(requests_mock):
    """
    Provide a simplistic response from opus, with only 2 entries.

    Registers a canned JSON body on the `requests_mock` fixture for the exact
    en -> ca OPUS API URL, so a GET to that URL returns the "Books" and
    "CCAligned" corpora below. The fixture returns nothing; tests request it
    only for this side effect.
    """
    requests_mock.get(
        "https://opus.nlpl.eu/opusapi/?source=en&target=ca&preprocessing=moses&version=latest",
        # Note that "latest" is the string "True" rather than a JSON boolean.
        text="""{
"corpora": [
{
"alignment_pairs": 4605,
"corpus": "Books",
"documents": "",
"id": 31736,
"latest": "True",
"preprocessing": "moses",
"size": 328,
"source": "ca",
"source_tokens": 73463,
"target": "en",
"target_tokens": 68625,
"url": "https://object.pouta.csc.fi/OPUS-Books/v1/moses/ca-en.txt.zip",
"version": "v1"
},
{
"alignment_pairs": 5802549,
"corpus": "CCAligned",
"documents": "",
"id": 32571,
"latest": "True",
"preprocessing": "moses",
"size": 522860,
"source": "ca",
"source_tokens": 89704109,
"target": "en",
"target_tokens": 84373417,
"url": "https://object.pouta.csc.fi/OPUS-CCAligned/v1/moses/ca-en.txt.zip",
"version": "v1"
}
]
}""",
    )
def assert_stdout(capsys, message: str, expected_output: str):
    """
    Assert that the captured stdout equals `expected_output`.

    Both sides are normalized before comparing: the text is dedented and
    stripped as a whole, then every line is stripped individually and
    terminated with a newline. Indentation and trailing spaces therefore never
    cause a mismatch. `message` is attached to the assertion.
    """

    def clean_text(text):
        body = dedent(text).strip()
        return "".join(f"{line.strip()}\n" for line in body.split("\n"))

    actual = clean_text(capsys.readouterr().out)
    wanted = clean_text(expected_output)
    assert actual == wanted, message
def test_opus(mock_opus_data, capsys):
    """
    Run the tool for en -> ca with only the opus importer and compare stdout.

    The HTTP response comes from the `mock_opus_data` fixture. The comparison
    goes through `assert_stdout`, which strips each line before comparing.
    """
    find_corpus(["en", "ca", "--importer", "opus"])
    # NOTE(review): the expected text below has single-spaced columns and no
    # banner box or header separator row, whereas the tool pads columns and
    # prints a boxed banner. It looks whitespace-collapsed; confirm this
    # literal against the real output.
    assert_stdout(
        capsys,
        "The opus dataset outputs nicely.",
        """
Fetching datasets from:
https://opus.nlpl.eu/opusapi/?source=en&target=ca&preprocessing=moses&version=latest
OPUS - https://opus.nlpl.eu/
Dataset Code Sentences Size URL
CCAligned opus_CCAligned/v1 5802549 535.4 MB https://opus.nlpl.eu/CCAligned-v1.php
Books opus_Books/v1 4605 335.9 kB https://opus.nlpl.eu/Books-v1.php
YAML:
- opus_Books/v1
- opus_CCAligned/v1
""",
    )
def test_opus_download_url(mock_opus_data, capsys):
    """
    With `--download_url`, the opus table must list each corpus's direct
    download link rather than its information page.
    """
    find_corpus(["en", "ca", "--importer", "opus", "--download_url"])

    stdout = capsys.readouterr().out
    assert "https://object.pouta.csc.fi/OPUS-CCAligned/v1/moses/ca-en.txt.zip" in stdout
    assert "https://object.pouta.csc.fi/OPUS-Books/v1/moses/ca-en.txt.zip" in stdout
# mtdata has some deprecated dependencies
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_mtdata(requests_mock, capsys):
    """
    Run the tool for en -> ca with only the mtdata importer and compare stdout.

    No URL is registered on `requests_mock` here; presumably the fixture is
    requested only so that no real HTTP request can go out (TODO confirm).
    """
    find_corpus(["en", "ca", "--importer", "mtdata"])
    # NOTE(review): the expected text below looks whitespace-collapsed (no
    # banner box, no header separator row, single-spaced columns); confirm
    # this literal against the real output.
    assert_stdout(
        capsys,
        "mtdata outputs nicely",
        """
mtdata - https://github.com/thammegowda/mtdata
Dataset URL
mtdata_ELRC-wikipedia_health-1-cat-eng https://elrc-share.eu/repository/download/ac6d557e8de811ea913100155d026706b0c5fee96b88489781ddd7675f8ea2ae/
mtdata_Facebook-wikimatrix-1-cat-eng https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.ca-en.tsv.gz
mtdata_Statmt-ccaligned-1-cat_ES-eng http://www.statmt.org/cc-aligned/sentence-aligned/ca_ES-en_XX.tsv.xz
YAML:
- mtdata_ELRC-wikipedia_health-1-cat-eng
- mtdata_Facebook-wikimatrix-1-cat-eng
- mtdata_Statmt-ccaligned-1-cat_ES-eng
""",
    )
def test_sacrebleu(requests_mock, capsys):
    """
    Run the tool for en -> iu with only the sacrebleu importer and compare stdout.

    No URL is registered on `requests_mock` here; presumably the fixture is
    requested only so that no real HTTP request can go out (TODO confirm).
    """
    # "iu" is the Inuktitut language, which has a small dataset available.
    find_corpus(["en", "iu", "--importer", "sacrebleu"])
    # NOTE(review): the expected text below looks whitespace-collapsed (no
    # banner box, no header separator row, single-spaced columns); confirm
    # this literal against the real output.
    assert_stdout(
        capsys,
        "sacrebleu outputs nicely",
        """
sacrebleu - https://github.com/mjpost/sacrebleu
Dataset Description URLs
wmt20 Official evaluation data for WMT20 http://data.statmt.org/wmt20/translation-task/test.tgz
wmt20/dev Development data for tasks new to 2020. http://data.statmt.org/wmt20/translation-task/dev.tgz
YAML:
- sacrebleu_wmt20
- sacrebleu_wmt20/dev
""",
    )

Просмотреть файл

@ -1,70 +0,0 @@
#!/usr/bin/env python3
"""
Finds all opus datasets for a language pair and prints them to set config settings.

Usage:
    python find-corpus.py <src> <trg> <importer>

Params:
    src - source language code
    trg - target language code
    importer - importer type (mtdata, opus, sacrebleu)
"""

import sys

import requests

# Show the usage text instead of failing with an IndexError on missing arguments.
if len(sys.argv) < 4:
    print(__doc__)
    sys.exit(1)

source = sys.argv[1]
target = sys.argv[2]
importer = sys.argv[3]

# exclude = ['bible', 'Ubuntu', 'Gnome', 'KDE', 'Multi', 'OPUS100v']
exclude = []
names = []

if importer == "opus":
    exclude += ["OPUS100v", "WMT-News"]
    datasets = requests.get(
        f"https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest"
    ).json()
    names = [f'opus_{d["corpus"]}/{d["version"]}' for d in datasets["corpora"]]
elif importer == "sacrebleu":
    import sacrebleu

    names = [
        f"sacrebleu_{name}"
        for name, meta in sacrebleu.DATASETS.items()
        if f"{source}-{target}" in meta or f"{target}-{source}" in meta
    ]
elif importer == "mtdata":
    from mtdata.entry import lang_pair
    from mtdata.index import get_entries
    from mtdata.iso import iso3_code

    source_tricode = iso3_code(source, fail_error=True)
    target_tricode = iso3_code(target, fail_error=True)
    exclude += ["opus", "newstest", "UNv1"]
    entries = sorted(
        get_entries(lang_pair(source_tricode + "-" + target_tricode), None, None, True),
        key=lambda entry: entry.did.group,
    )
    names = [
        f"mtdata_{entry.did.group}-{entry.did.name}-{entry.did.version}-{entry.did.lang_str}"
        for entry in entries
    ]
else:
    print(
        f"Importer type {importer} is unsupported. Supported importers: opus, mtdata, sacrebleu"
    )
    # Stop here: previously the script fell through, printed an empty list and exited 0.
    sys.exit(1)

# Drop every name that contains one of the exclude fragments, ignoring case.
# A set is used so duplicate names are printed once.
cleaned = {
    name for name in names if not any(fragment.lower() in name.lower() for fragment in exclude)
}

print("\n".join(sorted(f" - {name}" for name in cleaned)))

308
utils/find_corpus.py Executable file
Просмотреть файл

@ -0,0 +1,308 @@
#!/usr/bin/env python3
"""
Finds all opus datasets for a language pair and prints them to set config settings.
Usage:
poetry install --only utils
poetry run ./utils/find_corpus.py "en" "ca"
poetry run ./utils/find_corpus.py "en" "fr" --importer opus
"""
import argparse
import logging
import sys
from typing import Literal, NamedTuple, Optional, TypeVar, Union

import humanize
import requests
class OpusDataset(NamedTuple):
    """
    One corpus entry from the OPUS API response.

    The field names match the keys of each object in the response's "corpora"
    list, which lets an entry be built with `OpusDataset(**corpus_data)`.
    """

    # The name of this dataset, e.g. "CCAligned"
    corpus: str
    # This is a blank string at the time of this writing.
    documents: str
    # 'moses'
    preprocessing: str
    # The language tag.
    source: str
    # The language tag.
    target: str
    # The URL to the download
    url: str
    # For example "v1"
    version: str

    alignment_pairs: int
    id: int
    # Size in KiB
    size: int
    source_tokens: int
    target_tokens: int
    # The flag arrives as the string "True" or "False", not as a boolean.
    latest: Literal["True", "False"]

    def name(self) -> str:
        """Return the dataset code used in configs, e.g. "opus_CCAligned/v1"."""
        return f"opus_{self.corpus}/{self.version}"

    def website_url(self) -> str:
        """Return the URL of the corpus's information page on opus.nlpl.eu."""
        return f"https://opus.nlpl.eu/{self.corpus}-{self.version}.php"

    def humanize_size(self) -> str:
        """Return the size as human-readable text; `size` is in KiB, so convert to bytes first."""
        return humanize.naturalsize(self.size * 1024)
def get_opus(source: str, target: str, download_url: bool):
    """
    Fetch the OPUS datasets for a language pair and print them.

    Prints the queried URL, a banner, a table sorted by sentence count
    (largest first), and finally the YAML list of dataset codes.

    Args:
        source: Source language code.
        target: Target language code.
        download_url: When true, the table's URL column shows each corpus's
            download link instead of its information page.
    """
    # This API is documented: https://opus.nlpl.eu/opusapi/
    url = f"https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest"

    print(f"Fetching datasets from:\n{url}\n")

    datasets = requests.get(url).json()

    # Convert the response into a typed object that is sorted.
    datasets_typed = sorted(
        (OpusDataset(**corpus_data) for corpus_data in datasets.get("corpora", [])),
        key=lambda dataset: dataset.alignment_pairs or 0,
        reverse=True,
    )

    print("")
    print("┌──────────────────────────────┐")
    print("│ OPUS - https://opus.nlpl.eu/ │")
    print("└──────────────────────────────┘")

    header = ["Dataset", "Code", "Sentences", "Size", "URL"]
    rows = [
        [
            dataset.corpus,
            dataset.name(),
            dataset.alignment_pairs,
            dataset.humanize_size(),
            dataset.url if download_url else dataset.website_url(),
        ]
        for dataset in datasets_typed
    ]
    print_table([header, *rows])

    # Take the names from the typed entries rather than indexing
    # datasets["corpora"], so a response without that key prints
    # "(no datasets)" instead of raising a KeyError. The order does not matter,
    # as print_yaml sorts the names.
    names = [dataset.name() for dataset in datasets_typed]
    print_yaml(names, exclude=["OPUS100v", "WMT-News"])
def get_sacrebleu(source: str, target: str):
    """
    Print the sacrebleu datasets available for a language pair.

    A dataset is selected when "<source>-<target>" or "<target>-<source>" is
    contained in its `sacrebleu.DATASETS` entry. Prints a banner, a table of
    name / description / data URLs, and the YAML list of dataset codes.
    """
    import sacrebleu

    language_pairs = (f"{source}-{target}", f"{target}-{source}")
    entries = [
        (name, entry)
        for name, entry in sacrebleu.DATASETS.items()
        if any(pair in entry for pair in language_pairs)
    ]

    print("")
    print("┌─────────────────────────────────────────────────┐")
    print("│ sacrebleu - https://github.com/mjpost/sacrebleu │")
    print("└─────────────────────────────────────────────────┘")

    rows = [["Dataset", "Description", "URLs"]]
    for name, entry in entries:
        rows.append([name, entry["description"], ", ".join(entry["data"])])
    print_table(rows)

    print_yaml([f"sacrebleu_{name}" for name, _ in entries])
def get_remote_file_size(url: str) -> Optional[str]:
    """
    Look up the size of a remote file with an HTTP HEAD request.

    Args:
        url: The URL of the file.

    Returns:
        The Content-Length header formatted as human-readable text (a missing
        header is treated as 0), or None when the request fails or the status
        code is not 200. A message is printed in both failure cases.
    """
    # Only the request itself is guarded; the short timeout keeps an
    # unresponsive host from stalling the caller.
    try:
        response = requests.head(url, timeout=1)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve file information. Status code: {response.status_code}")
        return None

    return humanize.naturalsize(int(response.headers.get("Content-Length", 0)))
T = TypeVar("T")


def exclude_by_name(excludes: list[str], names: list[str], entries: list[T]) -> list[T]:
    """
    Exclude entries by an excludes list, and a name list.

    `names` and `entries` are paired positionally. An entry is dropped when its
    name contains any of the `excludes` fragments, ignoring case.

    Args:
        excludes: Substrings that disqualify a name.
        names: The name of each entry, in the same order as `entries`.
        entries: The values to filter.

    Returns:
        The entries whose names matched none of the excludes, in input order.
    """
    # Lowercase the fragments once instead of once per name.
    lowered_excludes = [exclude.lower() for exclude in excludes]

    return [
        entry
        for name, entry in zip(names, entries)
        if not any(exclude in name.lower() for exclude in lowered_excludes)
    ]
def get_mtdata(source: str, target: str):
    """
    Print the mtdata datasets available for a language pair.

    The language codes are converted with mtdata's `iso3_code`, and the
    entries are sorted by group. Prints a banner, a table of dataset code and
    URL, and the YAML list of dataset codes. Names containing "opus",
    "newstest" or "UNv1" are left out of both.

    Side effect: logging is disabled up to CRITICAL and is not re-enabled.
    """
    # mtdata outputs debug logs
    logging.disable(logging.CRITICAL)

    from mtdata.entry import lang_pair
    from mtdata.index import get_entries
    from mtdata.iso import iso3_code

    def get_name(entry):
        did = entry.did
        return f"mtdata_{did.group}-{did.name}-{did.version}-{did.lang_str}"

    source_tricode = iso3_code(source, fail_error=True)
    target_tricode = iso3_code(target, fail_error=True)
    pair = lang_pair(source_tricode + "-" + target_tricode)
    entries = sorted(
        get_entries(pair, None, None, True),
        key=lambda entry: entry.did.group,
    )

    excludes = ["opus", "newstest", "UNv1"]
    names = [get_name(entry) for entry in entries]

    print("")
    print("┌────────────────────────────────────────────────┐")
    print("│ mtdata - https://github.com/thammegowda/mtdata │")
    print("└────────────────────────────────────────────────┘")

    # A "Size" column fed by get_remote_file_size(entry.url) is currently disabled.
    rows = [["Dataset", "URL"]]
    # Filter out the excludes
    for entry in exclude_by_name(excludes, names, entries):
        rows.append([get_name(entry), entry.url])
    print_table(rows)

    print_yaml(names, exclude=excludes)
def print_yaml(names: list[str], exclude: Optional[list[str]] = None):
    """
    Print the dataset names as a sorted YAML list under a "YAML:" heading.

    Args:
        names: The dataset codes to print. Duplicates are printed once.
        exclude: Substrings that disqualify a name, ignoring case. Defaults to
            excluding nothing.

    When no names remain, "(no datasets)" is printed instead of the list.
    """
    lowered_excludes = [fragment.lower() for fragment in exclude or []]

    cleaned = {
        name
        for name in names
        if not any(fragment in name.lower() for fragment in lowered_excludes)
    }

    print("\nYAML:")
    if not cleaned:
        print("(no datasets)\n")
    else:
        print("\n".join(sorted(f" - {name}" for name in cleaned)))
def run(source: str, target: str, importer: Optional[str], download_url: bool = False):
    """
    Print the datasets for a language pair from one importer, or from all.

    Args:
        source: Source language code.
        target: Target language code.
        importer: "opus", "sacrebleu" or "mtdata" to run a single importer.
            None or an empty string runs all three, in that order. Any other
            value runs nothing.
        download_url: Passed to the opus importer, which requires it; when
            true, the opus table shows download links.
    """
    # The conditions used to test the builtin `type`, which is always truthy,
    # so "no importer means all importers" could never trigger.
    run_all = not importer

    if run_all or importer == "opus":
        get_opus(source, target, download_url)

    if run_all or importer == "sacrebleu":
        get_sacrebleu(source, target)

    if run_all or importer == "mtdata":
        get_mtdata(source, target)
def print_table(table: list[list[object]]):
    """
    Nicely print a table, the first row is the header.

    Every column is padded to its widest cell, and each cell, including the
    last one on a line, is followed by a single space. A separator row of "─"
    characters is printed under the header. When the table holds only the
    header row, "(no datasets)" is printed after it.

    Args:
        table: The rows to print; cells are converted with `str`.
    """
    # Compute the column lengths.
    column_lengths = [max(len(str(cell)) for cell in column) for column in zip(*table)]

    print("")
    for index, row in enumerate(table):
        # Print the row.
        print("".join(str(datum).ljust(width) + " " for datum, width in zip(row, column_lengths)))

        # Print a separator between the header and the rest of the table. The
        # fill character must be exactly one character long; an empty string
        # makes str.ljust raise a TypeError.
        if index == 0:
            print("".join("─" * width + " " for width in column_lengths))

    if len(table) == 1:
        print("(no datasets)")
def main(args: Optional[list[str]] = None) -> None:
    """
    Command-line entry point.

    Parses `args` (argparse falls back to the process arguments when it is
    None). Exits with status 1 after printing the help text when a language
    code is missing, or after printing a message when the importer is not one
    of the supported names. Otherwise runs the chosen importer, or all three
    when none was given.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Preserves whitespace in the help text.
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument("source", type=str, nargs="?", help="Source language code")
    parser.add_argument("target", type=str, nargs="?", help="Target language code")
    parser.add_argument(
        "--importer", type=str, help="The importer to use: mtdata, opus, sacrebleu"
    )
    parser.add_argument(
        "--download_url",
        action="store_true",
        default=False,
        help="Show the download url if available.",
    )

    parsed = parser.parse_args(args)
    importer = parsed.importer

    if not (parsed.source and parsed.target):
        parser.print_help()
        sys.exit(1)

    if importer and importer not in ["opus", "sacrebleu", "mtdata"]:
        print(f'"{importer}" is not a valid importer.')
        sys.exit(1)

    # With no importer given, every importer runs, in this fixed order.
    run_all = not importer

    if run_all or importer == "opus":
        get_opus(parsed.source, parsed.target, parsed.download_url)

    if run_all or importer == "sacrebleu":
        get_sacrebleu(parsed.source, parsed.target)

    if run_all or importer == "mtdata":
        get_mtdata(parsed.source, parsed.target)


if __name__ == "__main__":
    main()