Update the find corpus tool to provide more information (#280)
* Add pytest-clarity for better text diffs in tests
* Add requests_mock for tests
* Add the test_data artifact to the .gitignore
* Use an underscore with find_corpus.py
* Update the find corpus tool to provide more information
* Add humanize to the dependency list
This commit is contained in:
Родитель
ae781439d0
Коммит
d1be2bca4a
|
@ -136,3 +136,5 @@ dmypy.json
|
|||
.models
|
||||
.bin
|
||||
.snakemake
|
||||
|
||||
tests_data
|
||||
|
|
4
Makefile
4
Makefile
|
@ -164,8 +164,8 @@ fix-all:
|
|||
|
||||
# Run unit tests
|
||||
run-tests:
|
||||
poetry install --only tests
|
||||
PYTHONPATH=$$(pwd) poetry run pytest tests
|
||||
poetry install --only tests --only utils
|
||||
PYTHONPATH=$$(pwd) poetry run pytest tests -vv
|
||||
|
||||
# Validates Taskcluster task graph locally
|
||||
validate-taskgraph:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "absl-py"
|
||||
|
@ -1021,6 +1021,20 @@ files = [
|
|||
[package.dependencies]
|
||||
pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""}
|
||||
|
||||
[[package]]
|
||||
name = "humanize"
|
||||
version = "4.9.0"
|
||||
description = "Python humanize utilities"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "humanize-4.9.0-py3-none-any.whl", hash = "sha256:ce284a76d5b1377fd8836733b983bfb0b76f1aa1c090de2566fcf008d7f6ab16"},
|
||||
{file = "humanize-4.9.0.tar.gz", hash = "sha256:582a265c931c683a7e9b8ed9559089dea7edcf6cc95be39a3cbc2c5d5ac2bcfa"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
tests = ["freezegun", "pytest", "pytest-cov"]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "3.4"
|
||||
|
@ -1189,6 +1203,30 @@ files = [
|
|||
docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"]
|
||||
testing = ["coverage", "pyyaml"]
|
||||
|
||||
[[package]]
|
||||
name = "markdown-it-py"
|
||||
version = "3.0.0"
|
||||
description = "Python port of markdown-it. Markdown parsing, done right!"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"},
|
||||
{file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
mdurl = ">=0.1,<1.0"
|
||||
|
||||
[package.extras]
|
||||
benchmarking = ["psutil", "pytest", "pytest-benchmark"]
|
||||
code-style = ["pre-commit (>=3.0,<4.0)"]
|
||||
compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"]
|
||||
linkify = ["linkify-it-py (>=1,<3)"]
|
||||
plugins = ["mdit-py-plugins"]
|
||||
profiling = ["gprof2dot"]
|
||||
rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"]
|
||||
testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
|
||||
|
||||
[[package]]
|
||||
name = "markupsafe"
|
||||
version = "2.1.3"
|
||||
|
@ -1258,6 +1296,17 @@ files = [
|
|||
{file = "MarkupSafe-2.1.3.tar.gz", hash = "sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mdurl"
|
||||
version = "0.1.2"
|
||||
description = "Markdown URL utilities"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
|
||||
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mohawk"
|
||||
version = "1.1.0"
|
||||
|
@ -1720,6 +1769,17 @@ docs = ["sphinx (>=1.7.1)"]
|
|||
redis = ["redis"]
|
||||
tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-flake8 (>=1.0.5)", "pytest-mypy (>=0.8.0)", "redis", "sphinx (>=3.0.3)"]
|
||||
|
||||
[[package]]
|
||||
name = "pprintpp"
|
||||
version = "0.4.0"
|
||||
description = "A drop-in replacement for pprint that's actually pretty"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pprintpp-0.4.0-py2.py3-none-any.whl", hash = "sha256:b6b4dcdd0c0c0d75e4d7b2f21a9e933e5b2ce62b26e1a54537f9651ae5a5c01d"},
|
||||
{file = "pprintpp-0.4.0.tar.gz", hash = "sha256:ea826108e2c7f49dc6d66c752973c3fc9749142a798d6b254e1e301cfdbc6403"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prefixed"
|
||||
version = "0.7.0"
|
||||
|
@ -2008,6 +2068,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
|
|||
[package.extras]
|
||||
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-clarity"
|
||||
version = "1.0.1"
|
||||
description = "A plugin providing an alternative, colourful diff output for failing assertions."
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
files = [
|
||||
{file = "pytest-clarity-1.0.1.tar.gz", hash = "sha256:505fe345fad4fe11c6a4187fe683f2c7c52c077caa1e135f3e483fe112db7772"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pprintpp = ">=0.4.0"
|
||||
pytest = ">=3.5.0"
|
||||
rich = ">=8.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "python-dateutil"
|
||||
version = "2.8.2"
|
||||
|
@ -2068,7 +2143,6 @@ files = [
|
|||
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
|
||||
|
@ -2076,15 +2150,8 @@ files = [
|
|||
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
|
||||
{file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
|
||||
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
|
||||
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
|
||||
|
@ -2101,7 +2168,6 @@ files = [
|
|||
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
|
||||
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
|
||||
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
|
||||
{file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
|
||||
{file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
|
||||
{file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
|
||||
|
@ -2109,7 +2175,6 @@ files = [
|
|||
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
|
||||
{file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
|
||||
|
@ -2234,6 +2299,25 @@ urllib3 = ">=1.21.1,<1.27"
|
|||
socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"]
|
||||
use-chardet-on-py3 = ["chardet (>=3.0.2,<5)"]
|
||||
|
||||
[[package]]
|
||||
name = "requests-mock"
|
||||
version = "1.11.0"
|
||||
description = "Mock out responses from the requests package"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "requests-mock-1.11.0.tar.gz", hash = "sha256:ef10b572b489a5f28e09b708697208c4a3b2b89ef80a9f01584340ea357ec3c4"},
|
||||
{file = "requests_mock-1.11.0-py2.py3-none-any.whl", hash = "sha256:f7fae383f228633f6bececebdab236c478ace2284d6292c6e7e2867b9ab74d15"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
requests = ">=2.3,<3"
|
||||
six = "*"
|
||||
|
||||
[package.extras]
|
||||
fixture = ["fixtures"]
|
||||
test = ["fixtures", "mock", "purl", "pytest", "requests-futures", "sphinx", "testtools"]
|
||||
|
||||
[[package]]
|
||||
name = "requests-oauthlib"
|
||||
version = "1.3.1"
|
||||
|
@ -2252,6 +2336,24 @@ requests = ">=2.0.0"
|
|||
[package.extras]
|
||||
rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "rich"
|
||||
version = "13.7.0"
|
||||
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
|
||||
optional = false
|
||||
python-versions = ">=3.7.0"
|
||||
files = [
|
||||
{file = "rich-13.7.0-py3-none-any.whl", hash = "sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235"},
|
||||
{file = "rich-13.7.0.tar.gz", hash = "sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
markdown-it-py = ">=2.2.0"
|
||||
pygments = ">=2.13.0,<3.0.0"
|
||||
|
||||
[package.extras]
|
||||
jupyter = ["ipywidgets (>=7.5.1,<9)"]
|
||||
|
||||
[[package]]
|
||||
name = "rsa"
|
||||
version = "4.9"
|
||||
|
@ -2293,24 +2395,24 @@ python-versions = ">=3.6"
|
|||
files = [
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b42169467c42b692c19cf539c38d4602069d8c1505e97b86387fcf7afb766e1d"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:07238db9cbdf8fc1e9de2489a4f68474e70dffcb32232db7c08fa61ca0c7c462"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:d92f81886165cb14d7b067ef37e142256f1c6a90a65cd156b063a43da1708cfd"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fff3573c2db359f091e1589c3d7c5fc2f86f5bdb6f24252c2d8e539d4e45f412"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:aa2267c6a303eb483de8d02db2871afb5c5fc15618d894300b88958f729ad74f"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:840f0c7f194986a63d2c2465ca63af8ccbbc90ab1c6001b1978f05119b5e7334"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:024cfe1fc7c7f4e1aff4a81e718109e13409767e4f871443cbff3dba3578203d"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-win32.whl", hash = "sha256:c69212f63169ec1cfc9bb44723bf2917cbbd8f6191a00ef3410f5a7fe300722d"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp310-cp310-win_amd64.whl", hash = "sha256:cabddb8d8ead485e255fe80429f833172b4cadf99274db39abc080e068cbcc31"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:bef08cd86169d9eafb3ccb0a39edb11d8e25f3dae2b28f5c52fd997521133069"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:b16420e621d26fdfa949a8b4b47ade8810c56002f5389970db4ddda51dbff248"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:b5edda50e5e9e15e54a6a8a0070302b00c518a9d32accc2346ad6c984aacd279"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:25c515e350e5b739842fc3228d662413ef28f295791af5e5110b543cf0b57d9b"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-manylinux_2_24_aarch64.whl", hash = "sha256:1707814f0d9791df063f8c19bb51b0d1278b8e9a2353abbb676c2f685dee6afe"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:46d378daaac94f454b3a0e3d8d78cafd78a026b1d71443f4966c696b48a6d899"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:09b055c05697b38ecacb7ac50bdab2240bfca1a0c4872b0fd309bb07dc9aa3a9"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-win32.whl", hash = "sha256:53a300ed9cea38cf5a2a9b069058137c2ca1ce658a874b79baceb8f892f915a7"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp311-cp311-win_amd64.whl", hash = "sha256:c2a72e9109ea74e511e29032f3b670835f8a59bbdc9ce692c5b4ed91ccf1eedb"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ebc06178e8821efc9692ea7544aa5644217358490145629914d8020042c24aa1"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:edaef1c1200c4b4cb914583150dcaa3bc30e592e907c01117c08b13a07255ec2"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:7048c338b6c86627afb27faecf418768acb6331fc24cfa56c93e8c9780f815fa"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d176b57452ab5b7028ac47e7b3cf644bcfdc8cacfecf7e71759f7f51a59e5c92"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-manylinux_2_24_aarch64.whl", hash = "sha256:1dc67314e7e1086c9fdf2680b7b6c2be1c0d8e3a8279f2e993ca2a7545fecf62"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3213ece08ea033eb159ac52ae052a4899b56ecc124bb80020d9bbceeb50258e9"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aab7fd643f71d7946f2ee58cc88c9b7bfc97debd71dcc93e03e2d174628e7e2d"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp312-cp312-win32.whl", hash = "sha256:5c365d91c88390c8d0a8545df0b5857172824b1c604e867161e6b3d59a827eaa"},
|
||||
|
@ -2318,7 +2420,7 @@ files = [
|
|||
{file = "ruamel.yaml.clib-0.2.8-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a5aa27bad2bb83670b71683aae140a1f52b0857a2deff56ad3f6c13a017a26ed"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c58ecd827313af6864893e7af0a3bb85fd529f862b6adbefe14643947cfe2942"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-macosx_12_0_arm64.whl", hash = "sha256:f481f16baec5290e45aebdc2a5168ebc6d35189ae6fea7a58787613a25f6e875"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:3fcc54cb0c8b811ff66082de1680b4b14cf8a81dce0d4fbf665c2265a81e07a1"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:77159f5d5b5c14f7c34073862a6b7d34944075d9f93e681638f6d753606c6ce6"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7f67a1ee819dc4562d444bbafb135832b0b909f81cc90f7aa00260968c9ca1b3"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4ecbf9c3e19f9562c7fdd462e8d18dd902a47ca046a2e64dba80699f0b6c09b7"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:87ea5ff66d8064301a154b3933ae406b0863402a799b16e4a1d24d9fbbcbe0d3"},
|
||||
|
@ -2326,7 +2428,7 @@ files = [
|
|||
{file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-win_amd64.whl", hash = "sha256:3f215c5daf6a9d7bbed4a0a4f760f3113b10e82ff4c5c44bec20a68c8014f675"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1b617618914cb00bf5c34d4357c37aa15183fa229b24767259657746c9077615"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a6a9ffd280b71ad062eae53ac1659ad86a17f59a0fdc7699fd9be40525153337"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:665f58bfd29b167039f714c6998178d27ccd83984084c286110ef26b230f259f"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:305889baa4043a09e5b76f8e2a51d4ffba44259f6b4c72dec8ca56207d9c6fe1"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:700e4ebb569e59e16a976857c8798aee258dceac7c7d6b50cab63e080058df91"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e2b4c44b60eadec492926a7270abb100ef9f72798e18743939bdbf037aab8c28"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e79e5db08739731b0ce4850bed599235d601701d5694c36570a99a0c5ca41a9d"},
|
||||
|
@ -2334,7 +2436,7 @@ files = [
|
|||
{file = "ruamel.yaml.clib-0.2.8-cp38-cp38-win_amd64.whl", hash = "sha256:56f4252222c067b4ce51ae12cbac231bce32aee1d33fbfc9d17e5b8d6966c312"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:03d1162b6d1df1caa3a4bd27aa51ce17c9afc2046c31b0ad60a0a96ec22f8001"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:bba64af9fa9cebe325a62fa398760f5c7206b215201b0ec825005f1b18b9bccf"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:9eb5dee2772b0f704ca2e45b1713e4e5198c18f515b52743576d196348f374d3"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:a1a45e0bb052edf6a1d3a93baef85319733a888363938e1fc9924cb00c8df24c"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:da09ad1c359a728e112d60116f626cc9f29730ff3e0e7db72b9a2dbc2e4beed5"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:184565012b60405d93838167f425713180b949e9d8dd0bbc7b49f074407c5a8b"},
|
||||
{file = "ruamel.yaml.clib-0.2.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a75879bacf2c987c003368cf14bed0ffe99e8e85acfa6c0bfffc21a090f16880"},
|
||||
|
@ -2995,4 +3097,4 @@ multidict = ">=4.0"
|
|||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "3e4e51ed7309819903d851969d04f5c12c43f63aafa46b8a5983512a40d20bf7"
|
||||
content-hash = "75996c59b50c2ad361524908ad9e6cd88a21b782e6d99adaa39fa4bf57ed70b4"
|
||||
|
|
|
@ -29,6 +29,7 @@ marian-tensorboard = "^0.2.1"
|
|||
sacrebleu="2.0.0"
|
||||
mtdata="0.3.2"
|
||||
requests="2.26.0"
|
||||
humanize = "^4.9.0"
|
||||
|
||||
[tool.poetry.group.tests.dependencies]
|
||||
sacrebleu="2.0.0"
|
||||
|
@ -37,6 +38,8 @@ requests="2.26.0"
|
|||
pytest="7.4.3"
|
||||
# use the latest main, switch to PyPi when released
|
||||
opustrainer = {git = "https://github.com/hplt-project/OpusTrainer.git", rev="9133e1525c7ee37f53ea14ee6a180152bf7ea192"}
|
||||
pytest-clarity = "^1.0.1"
|
||||
requests-mock = "^1.11.0"
|
||||
|
||||
[tool.black]
|
||||
extend-exclude= "/3rd_party"
|
||||
|
|
|
@ -0,0 +1,154 @@
|
|||
from textwrap import dedent
|
||||
|
||||
import pytest
|
||||
|
||||
from utils.find_corpus import main as find_corpus
|
||||
|
||||
"""
|
||||
Tests the `utils/find_corpus.py` script.
|
||||
"""
|
||||
|
||||
|
||||
@pytest.fixture
def mock_opus_data(requests_mock):
    """
    Register a canned OPUS API response for the "en" / "ca" query.

    The payload lists just two corpora ("Books" and "CCAligned"), so tests can check the
    rendered output without reaching the real service.
    """
    opus_url = (
        "https://opus.nlpl.eu/opusapi/?source=en&target=ca&preprocessing=moses&version=latest"
    )
    opus_response = """{
        "corpora": [
            {
                "alignment_pairs": 4605,
                "corpus": "Books",
                "documents": "",
                "id": 31736,
                "latest": "True",
                "preprocessing": "moses",
                "size": 328,
                "source": "ca",
                "source_tokens": 73463,
                "target": "en",
                "target_tokens": 68625,
                "url": "https://object.pouta.csc.fi/OPUS-Books/v1/moses/ca-en.txt.zip",
                "version": "v1"
            },
            {
                "alignment_pairs": 5802549,
                "corpus": "CCAligned",
                "documents": "",
                "id": 32571,
                "latest": "True",
                "preprocessing": "moses",
                "size": 522860,
                "source": "ca",
                "source_tokens": 89704109,
                "target": "en",
                "target_tokens": 84373417,
                "url": "https://object.pouta.csc.fi/OPUS-CCAligned/v1/moses/ca-en.txt.zip",
                "version": "v1"
            }
        ]
    }"""
    requests_mock.get(opus_url, text=opus_response)
|
||||
|
||||
|
||||
def assert_stdout(capsys, message: str, expected_output: str):
    """
    Compare what was written to stdout against an expected string.

    Both sides are normalized first: the text is dedented and stripped, then every line
    is stripped of surrounding whitespace, so indentation in the test source and trailing
    spaces in the output do not matter. `message` is reported when the comparison fails.
    """

    def normalize(raw):
        # One stripped line per input line, each terminated by a newline.
        body = dedent(raw).strip()
        return "".join(piece.strip() + "\n" for piece in body.split("\n"))

    actual = capsys.readouterr().out
    assert normalize(actual) == normalize(expected_output), message
|
||||
|
||||
|
||||
def test_opus(mock_opus_data, capsys):
    """
    The opus importer prints the fetch URL, a banner, a table of corpora and a YAML list.
    """
    expected_output = """
        Fetching datasets from:
        https://opus.nlpl.eu/opusapi/?source=en&target=ca&preprocessing=moses&version=latest


        ┌──────────────────────────────┐
        │ OPUS - https://opus.nlpl.eu/ │
        └──────────────────────────────┘

        Dataset Code Sentences Size URL
        ───────── ───────────────── ───────── ──────── ─────────────────────────────────────
        CCAligned opus_CCAligned/v1 5802549 535.4 MB https://opus.nlpl.eu/CCAligned-v1.php
        Books opus_Books/v1 4605 335.9 kB https://opus.nlpl.eu/Books-v1.php

        YAML:
        - opus_Books/v1
        - opus_CCAligned/v1
        """

    find_corpus(["en", "ca", "--importer", "opus"])
    assert_stdout(capsys, "The opus dataset outputs nicely.", expected_output)
|
||||
|
||||
|
||||
def test_opus_download_url(mock_opus_data, capsys):
    """
    With --download_url the direct download links are printed rather than the
    informational OPUS pages.
    """
    find_corpus(["en", "ca", "--importer", "opus", "--download_url"])
    stdout = capsys.readouterr().out

    # Checked in the same order as the table rows: largest corpus first.
    for download_link in (
        "https://object.pouta.csc.fi/OPUS-CCAligned/v1/moses/ca-en.txt.zip",
        "https://object.pouta.csc.fi/OPUS-Books/v1/moses/ca-en.txt.zip",
    ):
        assert download_link in stdout
|
||||
|
||||
|
||||
# mtdata has some deprecated dependencies
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_mtdata(requests_mock, capsys):
    """
    The mtdata importer prints a banner, a table of datasets with their URLs and a YAML list.
    """
    expected_output = """
        ┌────────────────────────────────────────────────┐
        │ mtdata - https://github.com/thammegowda/mtdata │
        └────────────────────────────────────────────────┘

        Dataset URL
        ────────────────────────────────────── ───────────────────────────────────────────────────────────────────────────────────────────────────────────
        mtdata_ELRC-wikipedia_health-1-cat-eng https://elrc-share.eu/repository/download/ac6d557e8de811ea913100155d026706b0c5fee96b88489781ddd7675f8ea2ae/
        mtdata_Facebook-wikimatrix-1-cat-eng https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.ca-en.tsv.gz
        mtdata_Statmt-ccaligned-1-cat_ES-eng http://www.statmt.org/cc-aligned/sentence-aligned/ca_ES-en_XX.tsv.xz

        YAML:
        - mtdata_ELRC-wikipedia_health-1-cat-eng
        - mtdata_Facebook-wikimatrix-1-cat-eng
        - mtdata_Statmt-ccaligned-1-cat_ES-eng
        """

    find_corpus(["en", "ca", "--importer", "mtdata"])
    assert_stdout(capsys, "mtdata outputs nicely", expected_output)
|
||||
|
||||
|
||||
def test_sacrebleu(requests_mock, capsys):
    """
    The sacrebleu importer prints a banner, a table of test sets and a YAML list.
    """
    expected_output = """
        ┌─────────────────────────────────────────────────┐
        │ sacrebleu - https://github.com/mjpost/sacrebleu │
        └─────────────────────────────────────────────────┘

        Dataset Description URLs
        ───────── ─────────────────────────────────────── ──────────────────────────────────────────────────────
        wmt20 Official evaluation data for WMT20 http://data.statmt.org/wmt20/translation-task/test.tgz
        wmt20/dev Development data for tasks new to 2020. http://data.statmt.org/wmt20/translation-task/dev.tgz

        YAML:
        - sacrebleu_wmt20
        - sacrebleu_wmt20/dev
        """

    # "iu" is the Inuktitut language, which has a small dataset available.
    find_corpus(["en", "iu", "--importer", "sacrebleu"])
    assert_stdout(capsys, "sacrebleu outputs nicely", expected_output)
|
|
@ -1,70 +0,0 @@
|
|||
#!/usr/bin/env python3
"""
Finds all datasets of one importer for a language pair and prints them to set config settings.

Usage:
    python find-corpus.py <src> <trg> <importer>

Params:
    src - source language code
    trg - target language code
    importer - importer type (mtdata, opus, sacrebleu)

"""

import sys

import requests

source = sys.argv[1]
target = sys.argv[2]
# Called `importer` (not `type`) so the builtin of that name stays usable.
importer = sys.argv[3]

# exclude = ['bible', 'Ubuntu', 'Gnome', 'KDE', 'Multi', 'OPUS100v']
exclude = []
names = []

if importer == "opus":
    exclude += ["OPUS100v", "WMT-News"]
    datasets = requests.get(
        f"https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest"
    ).json()
    names = [f'opus_{d["corpus"]}/{d["version"]}' for d in datasets["corpora"]]
elif importer == "sacrebleu":
    import sacrebleu

    # A test set matches when it lists the pair in either direction.
    names = [
        f"sacrebleu_{name}"
        for name, meta in sacrebleu.DATASETS.items()
        if f"{source}-{target}" in meta or f"{target}-{source}" in meta
    ]
elif importer == "mtdata":
    from mtdata.entry import lang_pair
    from mtdata.index import get_entries
    from mtdata.iso import iso3_code

    source_tricode = iso3_code(source, fail_error=True)
    target_tricode = iso3_code(target, fail_error=True)
    exclude += ["opus", "newstest", "UNv1"]
    entries = sorted(
        get_entries(lang_pair(source_tricode + "-" + target_tricode), None, None, True),
        key=lambda entry: entry.did.group,
    )
    names = [
        f"mtdata_{entry.did.group}-{entry.did.name}-{entry.did.version}-{entry.did.lang_str}"
        for entry in entries
    ]
else:
    print(f"Importer type {importer} is unsupported. Supported importers: opus, mtdata, sacrebleu")
    # Stop here with a failing status instead of going on to print an empty list.
    sys.exit(1)

# Drop every name containing one of the excluded fragments (case-insensitive); the set
# also removes duplicate names.
cleaned = {
    name for name in names if not any(ex.lower() in name.lower() for ex in exclude)
}

print("\n".join(sorted(f" - {name}" for name in cleaned)))
|
|
@ -0,0 +1,308 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Finds all opus datasets for a language pair and prints them to set config settings.
|
||||
|
||||
Usage:
|
||||
poetry install --only utils
|
||||
poetry run ./utils/find_corpus.py "en" "ca"
|
||||
poetry run ./utils/find_corpus.py "en" "fr" --importer opus
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
from typing import NamedTuple, Optional, TypeVar, Union
|
||||
|
||||
import humanize
|
||||
import requests
|
||||
|
||||
|
||||
class OpusDataset(NamedTuple):
    """
    One corpus entry from the "corpora" list of an OPUS API response.

    The field names match the keys of the JSON objects, so an entry can be built
    directly from a decoded object with `OpusDataset(**corpus_data)`.
    """

    # The name of this dataset, e.g. "CCAligned"
    corpus: str
    # This is a blank string at the time of this writing.
    documents: str

    # 'moses'
    preprocessing: str
    # The language tag.
    source: str
    # The language tag.
    target: str
    # The URL to the download
    url: str
    # For example "v1"
    version: str

    # Number of aligned sentence pairs. Optional because get_opus sorts with
    # `alignment_pairs or 0`, i.e. it allows for the count being absent.
    alignment_pairs: Optional[int]
    id: int
    # Size in KiB
    size: int
    source_tokens: int
    target_tokens: int

    # The flag arrives as the text "True" or "False" rather than as a boolean, so it is
    # typed as a plain string.
    latest: str

    def name(self) -> str:
        """Return the dataset code, e.g. "opus_CCAligned/v1"."""
        return f"opus_{self.corpus}/{self.version}"

    def website_url(self) -> str:
        """Return the URL of this corpus version's page on the OPUS website."""
        return f"https://opus.nlpl.eu/{self.corpus}-{self.version}.php"

    def humanize_size(self) -> str:
        """Return the size as human-readable text, e.g. "335.9 kB"."""
        # `size` is in KiB; naturalsize is given a byte count.
        return humanize.naturalsize(self.size * 1024)
|
||||
|
||||
|
||||
def get_opus(source: str, target: str, download_url: bool):
    """Print the OPUS datasets available for a language pair.

    Queries the OPUS API, prints a table of the returned corpora sorted by
    alignment pairs (the "Sentences" column, largest first), then prints the
    dataset names as a YAML list.

    Args:
        source: The source language code.
        target: The target language code.
        download_url: When true the "URL" column shows each dataset's download
            URL, otherwise it shows the dataset's page on the OPUS website.
    """
    # This API is documented: https://opus.nlpl.eu/opusapi/
    url = f"https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest"

    print(f"Fetching datasets from:\n{url}\n")

    datasets = requests.get(url).json()

    # Convert the response into a typed object that is sorted.
    datasets_typed = [OpusDataset(**corpus_data) for corpus_data in datasets.get("corpora", [])]
    datasets_typed = sorted(datasets_typed, key=lambda x: x.alignment_pairs or 0, reverse=True)

    print("")
    print("┌──────────────────────────────┐")
    print("│ OPUS - https://opus.nlpl.eu/ │")
    print("└──────────────────────────────┘")

    print_table(
        [
            [
                "Dataset",
                "Code",
                "Sentences",
                "Size",
                "URL",
            ],
            *[
                [
                    dataset.corpus,
                    dataset.name(),
                    dataset.alignment_pairs,
                    dataset.humanize_size(),
                    dataset.url if download_url else dataset.website_url(),
                ]
                for dataset in datasets_typed
            ],
        ]
    )

    # Build the names from the typed entries instead of indexing
    # datasets["corpora"] directly, so a response without a "corpora" key is
    # treated as empty here as well instead of raising a KeyError.
    names = [dataset.name() for dataset in datasets_typed]
    print_yaml(names, exclude=["OPUS100v", "WMT-News"])
|
||||
|
||||
|
||||
def get_sacrebleu(source: str, target: str):
    """Print the sacrebleu datasets that cover a language pair, in either direction.

    A dataset is kept when "<source>-<target>" or "<target>-<source>" is
    contained in its entry in ``sacrebleu.DATASETS``. The matches are printed
    as a table and then as a YAML list of "sacrebleu_<name>" items.
    """
    # Imported lazily so that sacrebleu is only required when this importer runs.
    import sacrebleu

    forward = f"{source}-{target}"
    backward = f"{target}-{source}"

    matches = []
    for dataset_name, dataset in sacrebleu.DATASETS.items():
        if forward in dataset or backward in dataset:
            matches.append((dataset_name, dataset))

    print("")
    print("┌─────────────────────────────────────────────────┐")
    print("│ sacrebleu - https://github.com/mjpost/sacrebleu │")
    print("└─────────────────────────────────────────────────┘")

    # The first row is the table header.
    rows = [["Dataset", "Description", "URLs"]]
    for dataset_name, dataset in matches:
        rows.append([dataset_name, dataset["description"], ", ".join(dataset["data"])])
    print_table(rows)

    print_yaml([f"sacrebleu_{dataset_name}" for dataset_name, _ in matches])
|
||||
|
||||
|
||||
def get_remote_file_size(url: str) -> Optional[str]:
    """Return the size of a remote file as human-readable text.

    Sends a HEAD request with a 1 second timeout and formats the
    "Content-Length" header with ``humanize.naturalsize``; a missing header is
    treated as 0 bytes.

    Args:
        url: The URL of the remote file.

    Returns:
        The formatted size string, or None when the request fails or the
        response status is not 200 (a message is printed in both cases).
    """
    try:
        response = requests.head(url, timeout=1)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve file information. Status code: {response.status_code}")
        return None

    return humanize.naturalsize(int(response.headers.get("Content-Length", 0)))
|
||||
|
||||
|
||||
T = TypeVar("T")


def exclude_by_name(excludes: list[str], names: list[str], entries: list[T]) -> list[T]:
    """Return the entries whose paired name contains none of the exclude terms.

    ``names`` and ``entries`` are walked in lockstep, and an entry is dropped
    when any exclude term is a case-insensitive substring of its name.
    """
    return [
        entry
        for name, entry in zip(names, entries)
        if not any(term.lower() in name.lower() for term in excludes)
    ]
|
||||
|
||||
|
||||
def get_mtdata(source: str, target: str):
    """Print the mtdata datasets available for a language pair.

    Looks up the mtdata entries for the pair, sorted by dataset group, prints
    a table of name and URL for every entry that is not excluded by name, and
    then prints the same names as a YAML list.
    """
    # mtdata outputs debug logs, so switch off all logging up to CRITICAL.
    logging.disable(logging.CRITICAL)

    from mtdata.entry import lang_pair
    from mtdata.index import get_entries
    from mtdata.iso import iso3_code

    def get_name(entry):
        return (
            f"mtdata_{entry.did.group}-{entry.did.name}-{entry.did.version}-{entry.did.lang_str}"
        )

    # mtdata is queried with the codes returned by iso3_code.
    pair = lang_pair(
        iso3_code(source, fail_error=True) + "-" + iso3_code(target, fail_error=True)
    )
    entries = sorted(
        get_entries(pair, None, None, True),
        key=lambda entry: entry.did.group,
    )
    excludes = ["opus", "newstest", "UNv1"]
    names = [get_name(entry) for entry in entries]

    print("")
    print("┌────────────────────────────────────────────────┐")
    print("│ mtdata - https://github.com/thammegowda/mtdata │")
    print("└────────────────────────────────────────────────┘")

    # The first row is the header. A "Size" column filled by
    # get_remote_file_size(entry.url) is currently disabled.
    rows = [["Dataset", "URL"]]
    # Filter out the excludes
    for entry in exclude_by_name(excludes, names, entries):
        rows.append([get_name(entry), entry.url])
    print_table(rows)

    print_yaml(names, exclude=excludes)
|
||||
|
||||
|
||||
def print_yaml(names: list[str], exclude: Optional[list[str]] = None):
    """Print dataset names as a sorted, de-duplicated YAML list.

    Args:
        names: The dataset names to print.
        exclude: Terms to filter by; a name is dropped when any term is a
            case-insensitive substring of it. Defaults to no exclusions.
    """
    # None replaces a shared mutable default list.
    exclude_terms = [term.lower() for term in (exclude or [])]

    # A set removes duplicate names.
    cleaned = {
        name for name in names if not any(term in name.lower() for term in exclude_terms)
    }

    print("\nYAML:")
    if not cleaned:
        print("(no datasets)\n")
    else:
        print("\n".join(sorted(f" - {name}" for name in cleaned)))
|
||||
|
||||
|
||||
def run(source: str, target: str, importer: Optional[str], download_url: bool = False):
    """Print the datasets for a language pair from one importer, or from all of them.

    Args:
        source: The source language code.
        target: The target language code.
        importer: "opus", "sacrebleu" or "mtdata"; a falsy value runs all three.
        download_url: Passed to get_opus, which uses it to show the download
            URL instead of the OPUS website URL.
    """
    # Test `importer` here: the builtin `type` is always truthy, so testing it
    # would mean an empty importer never ran anything.
    run_all = not importer

    if importer == "opus" or run_all:
        # get_opus requires the download_url argument.
        get_opus(source, target, download_url)

    if importer == "sacrebleu" or run_all:
        get_sacrebleu(source, target)

    if importer == "mtdata" or run_all:
        get_mtdata(source, target)
|
||||
|
||||
|
||||
def print_table(table: list[list[object]]):
    """
    Nicely print a table, the first row is the header

    Each cell is converted with ``str`` and left-justified to the width of the
    widest cell in its column. A rule of "─" characters is printed under the
    header, and "(no datasets)" is printed when the table has only a header row.
    """
    # Compute the column lengths.
    column_lengths = [max(len(str(cell)) for cell in column) for column in zip(*table)]

    print("")
    for index, row in enumerate(table):
        # Print the row.
        print("".join(str(datum).ljust(width) + " " for datum, width in zip(row, column_lengths)))

        # Print a separator between the header and the rest of the table.
        if index == 0:
            print("".join("─" * width + " " for width in column_lengths))

    if len(table) == 1:
        print("(no datasets)")
|
||||
|
||||
|
||||
def main(args: Optional[list[str]] = None) -> None:
    """Command-line entry point: parse the arguments and print the matching datasets.

    Prints the help text and exits with status 1 when the source or target
    language is missing, and exits with status 1 when an unknown importer is
    named. Without ``--importer`` all three importers are run in the order
    opus, sacrebleu, mtdata.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Preserves whitespace in the help text.
        formatter_class=argparse.RawTextHelpFormatter,
    )
    for positional, description in (
        ("source", "Source language code"),
        ("target", "Target language code"),
    ):
        parser.add_argument(positional, type=str, nargs="?", help=description)
    parser.add_argument(
        "--importer", type=str, help="The importer to use: mtdata, opus, sacrebleu"
    )
    parser.add_argument(
        "--download_url",
        action="store_true",
        default=False,
        help="Show the download url if available.",
    )

    options = parser.parse_args(args)
    source, target, importer = options.source, options.target, options.importer

    # Both positionals are optional for argparse, so a missing one is handled
    # here by showing the full help text.
    if not (source and target):
        parser.print_help()
        sys.exit(1)

    if importer and importer not in ["opus", "sacrebleu", "mtdata"]:
        print(f'"{importer}" is not a valid importer.')
        sys.exit(1)

    every_importer = not importer

    if every_importer or importer == "opus":
        get_opus(source, target, options.download_url)

    if every_importer or importer == "sacrebleu":
        get_sacrebleu(source, target)

    if every_importer or importer == "mtdata":
        get_mtdata(source, target)


if __name__ == "__main__":
    main()
|
Загрузка…
Ссылка в новой задаче