From 733ced6d8d6486ab540481bd9e585e9f2353770a Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Thu, 18 Jan 2018 12:26:21 +0200 Subject: [PATCH] Bug 1431356 - Update encoding_rs to 0.7.2 and simd to 0.2.1. r=emk. MozReview-Commit-ID: Lp3zyF2rLxN --HG-- extra : rebase_source : 81b515206ca5d28623cbaead16244ef258da2088 --- .../rust/encoding_rs/.cargo-checksum.json | 2 +- third_party/rust/encoding_rs/CONTRIBUTING.md | 4 +- third_party/rust/encoding_rs/Cargo.toml | 14 +- third_party/rust/encoding_rs/README.md | 37 +- third_party/rust/encoding_rs/src/ascii.rs | 382 ++- third_party/rust/encoding_rs/src/gb18030.rs | 14 +- third_party/rust/encoding_rs/src/lib.rs | 58 +- third_party/rust/encoding_rs/src/mem.rs | 2873 +++++++++++++++++ .../rust/encoding_rs/src/simd_funcs.rs | 159 +- third_party/rust/encoding_rs/src/testing.rs | 129 +- third_party/rust/encoding_rs/src/utf_8.rs | 32 +- third_party/rust/simd/.cargo-checksum.json | 2 +- third_party/rust/simd/Cargo.toml | 43 +- third_party/rust/simd/examples/mandelbrot.rs | 2 +- .../rust/simd/examples/matrix-inverse.rs | 1 + .../rust/simd/examples/nbody-nosimd.rs | 2 +- third_party/rust/simd/examples/ops.rs | 1 + third_party/rust/simd/src/aarch64/neon.rs | 2 +- third_party/rust/simd/src/arm/neon.rs | 2 +- third_party/rust/simd/src/common.rs | 3 +- third_party/rust/simd/src/lib.rs | 17 +- third_party/rust/simd/src/sixty_four.rs | 3 +- third_party/rust/simd/src/v256.rs | 16 +- third_party/rust/simd/src/x86/avx.rs | 2 +- third_party/rust/simd/src/x86/avx2.rs | 2 +- third_party/rust/simd/src/x86/sse2.rs | 2 +- toolkit/library/gtest/rust/Cargo.lock | 14 +- toolkit/library/rust/Cargo.lock | 14 +- 28 files changed, 3601 insertions(+), 231 deletions(-) create mode 100644 third_party/rust/encoding_rs/src/mem.rs diff --git a/third_party/rust/encoding_rs/.cargo-checksum.json b/third_party/rust/encoding_rs/.cargo-checksum.json index 29396d9f3a9e..fccc538b34d0 100644 --- a/third_party/rust/encoding_rs/.cargo-checksum.json +++ 
b/third_party/rust/encoding_rs/.cargo-checksum.json @@ -1 +1 @@ -{"files":{".travis.yml":"dc509cc3b8f44fbdf1d806f533c3f005afaf0fd77cd266b38cb69bab3e4ea136","CONTRIBUTING.md":"e4ffa92c979c7e6ca7b676842a708ea05b84181327fcde43dfcd8038b678a057","COPYRIGHT":"20d4fff11cca11529df3f02096fbe8ffe350219cdb07cdedea34e6a762866da5","Cargo.toml":"2bed851f8857df3daf0cef25b3588a0841241624ab326e81cce188a598395352","Ideas.md":"7fbeddb0f8ba7b233673ee705997adc2fddb1636a17fe662532b35ef2810a51d","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"74aa8b6d04c36bb640ee81187a3f24a2fa94e36d4c1d4f2ca164c3784ae87a83","README.md":"cf09a31640f5d556661e2fbe1d07f76046eff94daf6ebb895d14499653b59bde","generate-encoding-data.py":"8a0a5162098d355e4df63532819769fd6626a66a0aa93f2762e315d6147aa0a5","rustfmt.toml":"c01c06dfbdfcf30730535aab911d69068febb921e2faef9571ceeb6a5c2a3eab","src/ascii.rs":"1e9f9a02130933fdba6b7606b47c2308afd6d16df064779245226060211af0ce","src/big5.rs":"780ae537353f899a5772a9e7d062441041276e1eb1506a013e4280c5cda6bb93","src/data.rs":"412c842c698c3ce1cec4a27ab19ca275372ac28940ac49cdf3e0dad71a2c2812","src/euc_jp.rs":"feda0ade5e1c3e4abd7637c59373b977662007990fd164ea7db1acc502ba3534","src/euc_kr.rs":"2699c52055882e34ba4e12d072b8161c635840f3620075ca3f10986aec0e8d3b","src/gb18030.rs":"aa9de27a41715dfb02a3b9161d86e3775f635f625f70d3abaadcd583ee7022c0","src/handles.rs":"c07a3738e43e8aae11108a30d34067c31ddc5d22074b85ef393f00abcc1f4e01","src/iso_2022_jp.rs":"1f780c3ff72f1a867d6c5782135cd01427eca6d74f0dd6cb23c1406b5163af1a","src/lib.rs":"250cabe96d561b38eef9e26141707904b66b612007098287dd2b245240c5a5be","src/macros.rs":"9ab30e7194f61f268cd7d899cabb06ff9ca7717663926fd583b20334f49ac8d3","src/replacement.rs":"782f03f04d110e9a0656262bf4296aa0ab8199e196cb63239c30d9649996caa4","src/shift_jis.rs":"84df4ff58b60e0827d6c0c7049f2cf19033f2b9e25a9186bcfb0bbb05e87b380","src/simd_funcs.rs":"6c5beb75d30c1b3a2e6e9dd86209f9748313ee75f5b43a9d7f5176be310ffabb","src/single
_byte.rs":"b3fadb4fa1e66b00efc12b8850b3076580a8cd73c9ed810a19421fd3ade9bbf1","src/test_data/big5_in.txt":"4c5a8691f8dc717311889c63894026d2fb62725a86c4208ca274a9cc8d42a503","src/test_data/big5_in_ref.txt":"99d399e17750cf9c7cf30bb253dbfe35b81c4fcbdead93cfa48b1429213473c7","src/test_data/big5_out.txt":"6193ca97c297aa20e09396038d18e938bb7ea331c26f0f2454097296723a0b13","src/test_data/big5_out_ref.txt":"36567691f557df144f6cc520015a87038dfa156f296fcf103b56ae9a718be1fc","src/test_data/euc_kr_in.txt":"c86a7224f3215fa0d04e685622a752fdc72763e8ae076230c7fd62de57ec4074","src/test_data/euc_kr_in_ref.txt":"1f419f4ca47d708b54c73c461545a022ae2e20498fdbf8005a483d752a204883","src/test_data/euc_kr_out.txt":"e7f32e026f70be1e1b58e0047baf7d3d2c520269c4f9b9992e158b4decb0a1a3","src/test_data/euc_kr_out_ref.txt":"c9907857980b20b8e9e3b584482ed6567a2be6185d72237b6322f0404944924e","src/test_data/gb18030_in.txt":"ab7231b2d3e9afacdbd7d7f3b9e5361a7ff9f7e1cfdb4f3bd905b9362b309e53","src/test_data/gb18030_in_ref.txt":"dc5069421adca2043c55f5012b55a76fdff651d22e6e699fd0978f8d5706815c","src/test_data/gb18030_out.txt":"f0208d527f5ca63de7d9a0323be8d5cf12d8a104b2943d92c2701f0c3364dac1","src/test_data/gb18030_out_ref.txt":"6819fe47627e4ea01027003fc514b9f21a1322e732d7f1fb92cc6c5455bc6c07","src/test_data/iso_2022_jp_in.txt":"cd24bbdcb1834e25db54646fbf4c41560a13dc7540f6be3dba4f5d97d44513af","src/test_data/iso_2022_jp_in_ref.txt":"3dc4e6a5e06471942d086b16c9440945e78415f6f3f47e43717e4bc2eac2cdf5","src/test_data/iso_2022_jp_out.txt":"9b6f015329dda6c3f9ee5ce6dbd6fa9c89acc21283e886836c78b8d833480c21","src/test_data/iso_2022_jp_out_ref.txt":"78cb260093a20116ad9a42f43b05d1848c5ab100b6b9a850749809e943884b35","src/test_data/jis0208_in.txt":"6df3030553ffb0a6615bb33dc8ea9dca6d9623a9028e2ffec754ce3c3da824cc","src/test_data/jis0208_in_ref.txt":"3dc4e6a5e06471942d086b16c9440945e78415f6f3f47e43717e4bc2eac2cdf5","src/test_data/jis0208_out.txt":"4ec24477e1675ce750733bdc3c5add1cd27b6bd4ce1f09289564646e9654e857","src/test_data/j
is0208_out_ref.txt":"c3e1cef5032b2b1d93a406f31ff940c4e2dfe8859b8b17ca2761fee7a75a0e48","src/test_data/jis0212_in.txt":"c011f0dd72bd7c8cd922df9374ef8d2769a77190514c77f6c62b415852eeb9fe","src/test_data/jis0212_in_ref.txt":"7d9458b3d2f73e7092a7f505c08ce1d233dde18aa679fbcf9889256239cc9e06","src/test_data/shift_jis_in.txt":"02e389ccef0dd2122e63f503899402cb7f797912c2444cc80ab93131116c5524","src/test_data/shift_jis_in_ref.txt":"512f985950ca902e643c88682dba9708b7c38d3c5ec2925168ab00ac94ab19f9","src/test_data/shift_jis_out.txt":"5fbc44da7bf639bf6cfe0fa1fd3eba7102b88f81919c9ea991302712f69426fb","src/test_data/shift_jis_out_ref.txt":"466322c6fed8286c64582731755290c2296508efdd258826e6279686649b481f","src/test_labels_names.rs":"c962c7aeac3d9ef2aca70c9e21983b231d4cf998cb06879374b0401e5149d1da","src/testing.rs":"60f85c6fb63fd4ab62e90dfa005920e79e0e1885795dc13a7a3c1980507925b1","src/utf_16.rs":"1d2c40857c946f6eecf724efc60a196865b4d84a59b08b42fbe4576fa8308fd0","src/utf_8.rs":"34218c7f4faa81883492fdfeb303b7e77710121b06e8342ac62ccb3d6eb16a37","src/utf_8_core.rs":"bbc010dbdfed0f5e7c48a1ab0772eaf2e27711b789bb82f71a678f2240158a65","src/variant.rs":"93dfec2dcfc9fd9711bb55d48177f4a0e9479c7fbd055f08db3853338569da83","src/x_user_defined.rs":"84d054eec249dd676452585f8eb13dc851095021ed6e1f8c79e952c6d81751df"},"package":"f5215aabf22b83153be3ee44dfe3f940214541b2ce13d419c55e7a115c8c51a9"} \ No newline at end of file 
+{"files":{".travis.yml":"dc509cc3b8f44fbdf1d806f533c3f005afaf0fd77cd266b38cb69bab3e4ea136","CONTRIBUTING.md":"6dac812ad206dbeb43b32ae01062fb79684fb01f9ee778c1c166852adc77d4c9","COPYRIGHT":"20d4fff11cca11529df3f02096fbe8ffe350219cdb07cdedea34e6a762866da5","Cargo.toml":"114f3399a97af04c9e2f8514448ccac81aac9ce7b333ec1594e733aad0c92e87","Ideas.md":"7fbeddb0f8ba7b233673ee705997adc2fddb1636a17fe662532b35ef2810a51d","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"74aa8b6d04c36bb640ee81187a3f24a2fa94e36d4c1d4f2ca164c3784ae87a83","README.md":"f60b9e7ff6d62f6cd580cbd386a039fda2d7407821da984fbe3cdb9c4a64f5d3","generate-encoding-data.py":"8a0a5162098d355e4df63532819769fd6626a66a0aa93f2762e315d6147aa0a5","rustfmt.toml":"c01c06dfbdfcf30730535aab911d69068febb921e2faef9571ceeb6a5c2a3eab","src/ascii.rs":"0fd4833571df22b0bb98e230c07b4ff733284c5b58b7b21a50f4f68c683ee706","src/big5.rs":"780ae537353f899a5772a9e7d062441041276e1eb1506a013e4280c5cda6bb93","src/data.rs":"412c842c698c3ce1cec4a27ab19ca275372ac28940ac49cdf3e0dad71a2c2812","src/euc_jp.rs":"feda0ade5e1c3e4abd7637c59373b977662007990fd164ea7db1acc502ba3534","src/euc_kr.rs":"2699c52055882e34ba4e12d072b8161c635840f3620075ca3f10986aec0e8d3b","src/gb18030.rs":"6a4d5ff9a89cdf1d89de78cd309f01385435dd9a4ffee182e13df2675cf57600","src/handles.rs":"c07a3738e43e8aae11108a30d34067c31ddc5d22074b85ef393f00abcc1f4e01","src/iso_2022_jp.rs":"1f780c3ff72f1a867d6c5782135cd01427eca6d74f0dd6cb23c1406b5163af1a","src/lib.rs":"b53cfe7009dcba83724ac2affa1f3fdd675451a33742ceb9f030eb83e702305f","src/macros.rs":"9ab30e7194f61f268cd7d899cabb06ff9ca7717663926fd583b20334f49ac8d3","src/mem.rs":"326003897f0efefa257210f4e698a2a039e7e9d2fe16e0fc9341b51a68ce1dff","src/replacement.rs":"782f03f04d110e9a0656262bf4296aa0ab8199e196cb63239c30d9649996caa4","src/shift_jis.rs":"84df4ff58b60e0827d6c0c7049f2cf19033f2b9e25a9186bcfb0bbb05e87b380","src/simd_funcs.rs":"76c4abc881f2dd91f8e936b059152fa4ee5056af0af59356fbf105436ddd6
73f","src/single_byte.rs":"b3fadb4fa1e66b00efc12b8850b3076580a8cd73c9ed810a19421fd3ade9bbf1","src/test_data/big5_in.txt":"4c5a8691f8dc717311889c63894026d2fb62725a86c4208ca274a9cc8d42a503","src/test_data/big5_in_ref.txt":"99d399e17750cf9c7cf30bb253dbfe35b81c4fcbdead93cfa48b1429213473c7","src/test_data/big5_out.txt":"6193ca97c297aa20e09396038d18e938bb7ea331c26f0f2454097296723a0b13","src/test_data/big5_out_ref.txt":"36567691f557df144f6cc520015a87038dfa156f296fcf103b56ae9a718be1fc","src/test_data/euc_kr_in.txt":"c86a7224f3215fa0d04e685622a752fdc72763e8ae076230c7fd62de57ec4074","src/test_data/euc_kr_in_ref.txt":"1f419f4ca47d708b54c73c461545a022ae2e20498fdbf8005a483d752a204883","src/test_data/euc_kr_out.txt":"e7f32e026f70be1e1b58e0047baf7d3d2c520269c4f9b9992e158b4decb0a1a3","src/test_data/euc_kr_out_ref.txt":"c9907857980b20b8e9e3b584482ed6567a2be6185d72237b6322f0404944924e","src/test_data/gb18030_in.txt":"ab7231b2d3e9afacdbd7d7f3b9e5361a7ff9f7e1cfdb4f3bd905b9362b309e53","src/test_data/gb18030_in_ref.txt":"dc5069421adca2043c55f5012b55a76fdff651d22e6e699fd0978f8d5706815c","src/test_data/gb18030_out.txt":"f0208d527f5ca63de7d9a0323be8d5cf12d8a104b2943d92c2701f0c3364dac1","src/test_data/gb18030_out_ref.txt":"6819fe47627e4ea01027003fc514b9f21a1322e732d7f1fb92cc6c5455bc6c07","src/test_data/iso_2022_jp_in.txt":"cd24bbdcb1834e25db54646fbf4c41560a13dc7540f6be3dba4f5d97d44513af","src/test_data/iso_2022_jp_in_ref.txt":"3dc4e6a5e06471942d086b16c9440945e78415f6f3f47e43717e4bc2eac2cdf5","src/test_data/iso_2022_jp_out.txt":"9b6f015329dda6c3f9ee5ce6dbd6fa9c89acc21283e886836c78b8d833480c21","src/test_data/iso_2022_jp_out_ref.txt":"78cb260093a20116ad9a42f43b05d1848c5ab100b6b9a850749809e943884b35","src/test_data/jis0208_in.txt":"6df3030553ffb0a6615bb33dc8ea9dca6d9623a9028e2ffec754ce3c3da824cc","src/test_data/jis0208_in_ref.txt":"3dc4e6a5e06471942d086b16c9440945e78415f6f3f47e43717e4bc2eac2cdf5","src/test_data/jis0208_out.txt":"4ec24477e1675ce750733bdc3c5add1cd27b6bd4ce1f09289564646e9654e857",
"src/test_data/jis0208_out_ref.txt":"c3e1cef5032b2b1d93a406f31ff940c4e2dfe8859b8b17ca2761fee7a75a0e48","src/test_data/jis0212_in.txt":"c011f0dd72bd7c8cd922df9374ef8d2769a77190514c77f6c62b415852eeb9fe","src/test_data/jis0212_in_ref.txt":"7d9458b3d2f73e7092a7f505c08ce1d233dde18aa679fbcf9889256239cc9e06","src/test_data/shift_jis_in.txt":"02e389ccef0dd2122e63f503899402cb7f797912c2444cc80ab93131116c5524","src/test_data/shift_jis_in_ref.txt":"512f985950ca902e643c88682dba9708b7c38d3c5ec2925168ab00ac94ab19f9","src/test_data/shift_jis_out.txt":"5fbc44da7bf639bf6cfe0fa1fd3eba7102b88f81919c9ea991302712f69426fb","src/test_data/shift_jis_out_ref.txt":"466322c6fed8286c64582731755290c2296508efdd258826e6279686649b481f","src/test_labels_names.rs":"c962c7aeac3d9ef2aca70c9e21983b231d4cf998cb06879374b0401e5149d1da","src/testing.rs":"16da398fdab694283d24556932ff7fce893e22cf66a180795a830541f4ddd94b","src/utf_16.rs":"1d2c40857c946f6eecf724efc60a196865b4d84a59b08b42fbe4576fa8308fd0","src/utf_8.rs":"dc7df98c65a23607071b699243aec75a461510ee0617abba289df1ebe781c08b","src/utf_8_core.rs":"bbc010dbdfed0f5e7c48a1ab0772eaf2e27711b789bb82f71a678f2240158a65","src/variant.rs":"93dfec2dcfc9fd9711bb55d48177f4a0e9479c7fbd055f08db3853338569da83","src/x_user_defined.rs":"84d054eec249dd676452585f8eb13dc851095021ed6e1f8c79e952c6d81751df"},"package":"98fd0f24d1fb71a4a6b9330c8ca04cbd4e7cc5d846b54ca74ff376bc7c9f798d"} \ No newline at end of file diff --git a/third_party/rust/encoding_rs/CONTRIBUTING.md b/third_party/rust/encoding_rs/CONTRIBUTING.md index 62f808646ab5..f8232f7703b8 100644 --- a/third_party/rust/encoding_rs/CONTRIBUTING.md +++ b/third_party/rust/encoding_rs/CONTRIBUTING.md @@ -37,8 +37,8 @@ rustc. ## rustfmt -Please install [`rustfmt`](https://github.com/rust-lang-nursery/rustfmt) and -run `cargo fmt` before creating a pull request. +The `rustfmt` version used for this code is 0.8.4. Please either use that +version or avoid using `rustfmt` (so as not to reformat all the code). 
## Unit tests diff --git a/third_party/rust/encoding_rs/Cargo.toml b/third_party/rust/encoding_rs/Cargo.toml index 4790409c1b86..c73dc02ee759 100644 --- a/third_party/rust/encoding_rs/Cargo.toml +++ b/third_party/rust/encoding_rs/Cargo.toml @@ -12,7 +12,7 @@ [package] name = "encoding_rs" -version = "0.7.1" +version = "0.7.2" authors = ["Henri Sivonen "] description = "A Gecko-oriented implementation of the Encoding Standard" homepage = "https://docs.rs/encoding_rs/" @@ -24,6 +24,9 @@ license = "MIT/Apache-2.0" repository = "https://github.com/hsivonen/encoding_rs" [profile.release] lto = true +[dependencies.cfg-if] +version = "0.1.0" + [dependencies.serde] version = "1.0" optional = true @@ -31,20 +34,17 @@ optional = true [dependencies.simd] version = "0.2.0" optional = true +[dev-dependencies.bincode] +version = "0.8" -[dependencies.cfg-if] -version = "0.1.0" [dev-dependencies.serde_derive] version = "1.0" [dev-dependencies.serde_json] version = "1.0" -[dev-dependencies.bincode] -version = "0.8" - [features] -simd-accel = ["simd"] no-static-ideograph-encoder-tables = [] +simd-accel = ["simd"] [badges.travis-ci] repository = "hsivonen/encoding_rs" diff --git a/third_party/rust/encoding_rs/README.md b/third_party/rust/encoding_rs/README.md index a3398bb7a7f0..3e5d4e70fcd0 100644 --- a/third_party/rust/encoding_rs/README.md +++ b/third_party/rust/encoding_rs/README.md @@ -9,6 +9,11 @@ encoding_rs an implementation of the (non-JavaScript parts of) the [Encoding Standard](https://encoding.spec.whatwg.org/) written in Rust and used in Gecko (starting with Firefox 56). +Additionally, the `mem` module provides various operations for dealing with +in-RAM text (as opposed to data that's coming from or going to an IO boundary). +The `mem` module is a module instead of a separate crate due to internal +implementation detail efficiencies. 
+ ## Functionality Due to the Gecko use case, encoding_rs supports decoding to and encoding from @@ -43,6 +48,26 @@ Specifically, encoding_rs does the following: workloads than the standard library; hopefully will get upstreamed some day) and ASCII. +Additionally, `encoding_rs::mem` does the following: + +* Checks if a byte buffer contains only ASCII. +* Checks if a potentially-invalid UTF-16 buffer contains only Basic Latin (ASCII). +* Checks if a valid UTF-8, potentially-invalid UTF-8 or potentially-invalid UTF-16 + buffer contains only Latin1 code points (below U+0100). +* Checks if a valid UTF-8, potentially-invalid UTF-8 or potentially-invalid UTF-16 + buffer or a code point or a UTF-16 code unit can trigger right-to-left behavior + (suitable for checking if the Unicode Bidirectional Algorithm can be optimized + out). +* Combined versions of the above two checks. +* Converts valid UTF-8, potentially-invalid UTF-8 and Latin1 to UTF-16. +* Converts potentially-invalid UTF-16 and Latin1 to UTF-8. +* Converts UTF-8 and UTF-16 to Latin1 (if in range). +* Finds the first invalid code unit in a buffer of potentially-invalid UTF-16. +* Makes a mutable buffer of potential-invalid UTF-16 contain valid UTF-16. +* Copies ASCII from one buffer to another up to the first non-ASCII byte. +* Converts ASCII to UTF-16 up to the first non-ASCII byte. +* Converts UTF-16 to ASCII up to the first non-Basic Latin code unit. + ## Licensing Please see the file named @@ -63,6 +88,8 @@ using the C++ standard library and [GSL](https://github.com/Microsoft/GSL/) type For the Gecko context, there's a [C++ wrapper using the MFBT/XPCOM types](https://searchfox.org/mozilla-central/source/intl/Encoding.h#100). +These bindings do not cover the `mem` module. + ## Sample programs * [Rust](https://github.com/hsivonen/recode_rs) @@ -133,9 +160,9 @@ decode-optimized tables. 
With realistic work loads, this seemed fast enough not to be user-visibly slow on Raspberry Pi 3 (which stood in for a phone for testing) in the Web-exposed encoder use cases. -A framework for measuring performance is [available separately][1]. +A framework for measuring performance is [available separately][2]. -[1]: https://github.com/hsivonen/encoding_bench/ +[2]: https://github.com/hsivonen/encoding_bench/ ## Rust Version Compatibility @@ -193,6 +220,12 @@ used in Firefox. ## Release Notes +### 0.7.2 + +* Add the `mem` module. +* Refactor SIMD code which can affect performance outside the `mem` + module. + ### 0.7.1 * When encoding from invalid UTF-16, correctly handle U+DC00 followed by diff --git a/third_party/rust/encoding_rs/src/ascii.rs b/third_party/rust/encoding_rs/src/ascii.rs index 2c89716f96a0..cbbdc70f927f 100644 --- a/third_party/rust/encoding_rs/src/ascii.rs +++ b/third_party/rust/encoding_rs/src/ascii.rs @@ -24,6 +24,14 @@ #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"))))] use simd_funcs::*; +// `as` truncates, so works on 32-bit, too. +#[allow(dead_code)] +pub const ASCII_MASK: usize = 0x80808080_80808080u64 as usize; + +// `as` truncates, so works on 32-bit, too. +#[allow(dead_code)] +pub const BASIC_LATIN_MASK: usize = 0xFF80FF80_FF80FF80u64 as usize; + #[allow(unused_macros)] macro_rules! ascii_naive { ($name:ident, @@ -212,6 +220,62 @@ macro_rules! basic_latin_alu { }); } +#[allow(unused_macros)] +macro_rules! 
latin1_alu { + ($name:ident, + $src_unit:ty, + $dst_unit:ty, + $stride_fn:ident) => ( + #[cfg_attr(feature = "cargo-clippy", allow(never_loop))] + #[inline(always)] + pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) { + let mut offset = 0usize; + // This loop is only broken out of as a `goto` forward + loop { + let mut until_alignment = { + if ::std::mem::size_of::<$src_unit>() < ::std::mem::size_of::<$dst_unit>() { + // unpack + let src_until_alignment = (ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK; + if (dst.offset(src_until_alignment as isize) as usize) & ALIGNMENT_MASK != 0 { + break; + } + src_until_alignment + } else { + // pack + let dst_until_alignment = (ALIGNMENT - ((dst as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK; + if (src.offset(dst_until_alignment as isize) as usize) & ALIGNMENT_MASK != 0 { + break; + } + dst_until_alignment + } + }; + if until_alignment + STRIDE_SIZE <= len { + while until_alignment != 0 { + let code_unit = *(src.offset(offset as isize)); + *(dst.offset(offset as isize)) = code_unit as $dst_unit; + offset += 1; + until_alignment -= 1; + } + let len_minus_stride = len - STRIDE_SIZE; + loop { + $stride_fn(src.offset(offset as isize) as *const usize, + dst.offset(offset as isize) as *mut usize); + offset += STRIDE_SIZE; + if offset > len_minus_stride { + break; + } + } + } + break; + } + while offset < len { + let code_unit = *(src.offset(offset as isize)); + *(dst.offset(offset as isize)) = code_unit as $dst_unit; + offset += 1; + } + }); +} + #[allow(unused_macros)] macro_rules! ascii_simd_check_align { ($name:ident, @@ -294,6 +358,89 @@ macro_rules! ascii_simd_check_align { }); } +#[allow(unused_macros)] +macro_rules! 
latin1_simd_check_align { + ($name:ident, + $src_unit:ty, + $dst_unit:ty, + $stride_both_aligned:ident, + $stride_src_aligned:ident, + $stride_dst_aligned:ident, + $stride_neither_aligned:ident) => ( + #[inline(always)] + pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) { + let mut offset = 0usize; + if STRIDE_SIZE <= len { + let len_minus_stride = len - STRIDE_SIZE; + // XXX Should we first process one stride unconditinoally as unaligned to + // avoid the cost of the branchiness below if the first stride fails anyway? + // XXX Should we just use unaligned SSE2 access unconditionally? It seems that + // on Haswell, it would make sense to just use unaligned and not bother + // checking. Need to benchmark older architectures before deciding. + let dst_masked = (dst as usize) & ALIGNMENT_MASK; + if ((src as usize) & ALIGNMENT_MASK) == 0 { + if dst_masked == 0 { + loop { + $stride_both_aligned(src.offset(offset as isize), + dst.offset(offset as isize)); + offset += STRIDE_SIZE; + if offset > len_minus_stride { + break; + } + } + } else { + loop { + $stride_src_aligned(src.offset(offset as isize), + dst.offset(offset as isize)); + offset += STRIDE_SIZE; + if offset > len_minus_stride { + break; + } + } + } + } else { + if dst_masked == 0 { + loop { + $stride_dst_aligned(src.offset(offset as isize), + dst.offset(offset as isize)); + offset += STRIDE_SIZE; + if offset > len_minus_stride { + break; + } + } + } else { + loop { + $stride_neither_aligned(src.offset(offset as isize), + dst.offset(offset as isize)); + offset += STRIDE_SIZE; + if offset > len_minus_stride { + break; + } + } + } + } + } + while offset < len { + let code_unit = *(src.offset(offset as isize)); + // On x86_64, this loop autovectorizes but in the pack + // case there are instructions whose purpose is to make sure + // each u16 in the vector is truncated before packing. 
However, + // since we don't care about saturating behavior of SSE2 packing + // when the input isn't Latin1, those instructions are useless. + // Unfortunately, using the `assume` intrinsic to lie to the + // optimizer doesn't make LLVM omit the trunctation that we + // don't need. Possibly this loop could be manually optimized + // to do the sort of thing that LLVM does but without the + // ANDing the read vectors of u16 with a constant that discards + // the high half of each u16. As far as I can tell, the + // optimization assumes that doing a SIMD read past the end of + // the array is OK. + *(dst.offset(offset as isize)) = code_unit as $dst_unit; + offset += 1; + } + }); +} + #[allow(unused_macros)] macro_rules! ascii_simd_unalign { ($name:ident, @@ -328,6 +475,34 @@ macro_rules! ascii_simd_unalign { }); } +#[allow(unused_macros)] +macro_rules! latin1_simd_unalign { + ($name:ident, + $src_unit:ty, + $dst_unit:ty, + $stride_neither_aligned:ident) => ( + #[inline(always)] + pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) { + let mut offset = 0usize; + if STRIDE_SIZE <= len { + let len_minus_stride = len - STRIDE_SIZE; + loop { + $stride_neither_aligned(src.offset(offset as isize), + dst.offset(offset as isize)); + offset += STRIDE_SIZE; + if offset > len_minus_stride { + break; + } + } + } + while offset < len { + let code_unit = *(src.offset(offset as isize)); + *(dst.offset(offset as isize)) = code_unit as $dst_unit; + offset += 1; + } + }); +} + #[allow(unused_macros)] macro_rules! ascii_to_ascii_simd_stride { ($name:ident, @@ -336,7 +511,7 @@ macro_rules! ascii_to_ascii_simd_stride { #[inline(always)] pub unsafe fn $name(src: *const u8, dst: *mut u8) -> bool { let simd = $load(src); - if !is_ascii(simd) { + if !simd_is_ascii(simd) { return false; } $store(dst, simd); @@ -352,7 +527,7 @@ macro_rules! 
ascii_to_basic_latin_simd_stride { #[inline(always)] pub unsafe fn $name(src: *const u8, dst: *mut u16) -> bool { let simd = $load(src); - if !is_ascii(simd) { + if !simd_is_ascii(simd) { return false; } let (first, second) = simd_unpack(simd); @@ -362,6 +537,20 @@ macro_rules! ascii_to_basic_latin_simd_stride { }); } +#[allow(unused_macros)] +macro_rules! unpack_simd_stride { + ($name:ident, + $load:ident, + $store:ident) => ( + #[inline(always)] + pub unsafe fn $name(src: *const u8, dst: *mut u16) { + let simd = $load(src); + let (first, second) = simd_unpack(simd); + $store(dst, first); + $store(dst.offset(8), second); + }); +} + #[allow(unused_macros)] macro_rules! basic_latin_to_ascii_simd_stride { ($name:ident, @@ -371,7 +560,7 @@ macro_rules! basic_latin_to_ascii_simd_stride { pub unsafe fn $name(src: *const u16, dst: *mut u8) -> bool { let first = $load(src); let second = $load(src.offset(8)); - if is_basic_latin(first | second) { + if simd_is_basic_latin(first | second) { $store(dst, simd_pack(first, second)); true } else { @@ -380,23 +569,40 @@ macro_rules! basic_latin_to_ascii_simd_stride { }); } +#[allow(unused_macros)] +macro_rules! pack_simd_stride { + ($name:ident, + $load:ident, + $store:ident) => ( + #[inline(always)] + pub unsafe fn $name(src: *const u16, dst: *mut u8) { + let first = $load(src); + let second = $load(src.offset(8)); + $store(dst, simd_pack(first, second)); + }); +} + cfg_if! 
{ if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "aarch64"))] { // SIMD with the same instructions for aligned and unaligned loads and stores pub const STRIDE_SIZE: usize = 16; - const ALIGNMENT: usize = 8; +// pub const ALIGNMENT: usize = 8; ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned); ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned); + unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned); basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned); + pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned); ascii_simd_unalign!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_neither_aligned); ascii_simd_unalign!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_neither_aligned); ascii_simd_unalign!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_neither_aligned); + latin1_simd_unalign!(unpack_latin1, u8, u16, unpack_stride_neither_aligned); + latin1_simd_unalign!(pack_latin1, u16, u8, pack_stride_neither_aligned); } else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] { // SIMD with different instructions for aligned and unaligned loads and stores. // @@ -406,7 +612,7 @@ cfg_if! { pub const STRIDE_SIZE: usize = 16; - const ALIGNMENT_MASK: usize = 15; + pub const ALIGNMENT_MASK: usize = 15; ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned); ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned); @@ -418,31 +624,37 @@ cfg_if! 
{ ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_dst_aligned, load16_unaligned, store8_aligned); ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned); + unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned); + unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned); + unpack_simd_stride!(unpack_stride_dst_aligned, load16_unaligned, store8_aligned); + unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned); + basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned); basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned); basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned); basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned); + pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned); + pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned); + pack_simd_stride!(pack_stride_dst_aligned, load8_unaligned, store16_aligned); + pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned); + ascii_simd_check_align!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_dst_aligned, ascii_to_ascii_stride_neither_aligned); ascii_simd_check_align!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_dst_aligned, ascii_to_basic_latin_stride_neither_aligned); ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned); + 
latin1_simd_check_align!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned); + latin1_simd_check_align!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned); } else if #[cfg(all(target_endian = "little", target_pointer_width = "64"))] { // Aligned ALU word, little-endian, 64-bit pub const STRIDE_SIZE: usize = 16; - const ALIGNMENT: usize = 8; + pub const ALIGNMENT: usize = 8; - const ALIGNMENT_MASK: usize = 7; + pub const ALIGNMENT_MASK: usize = 7; #[inline(always)] - unsafe fn ascii_to_basic_latin_stride_little_64(src: *const usize, dst: *mut usize) -> bool { - let word = *src; - let second_word = *(src.offset(1)); - // Check if the words contains non-ASCII - if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 { - return false; - } + unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) { let first = ((0x00000000_FF000000usize & word) << 24) | ((0x00000000_00FF0000usize & word) << 16) | ((0x00000000_0000FF00usize & word) << 8) | @@ -463,18 +675,10 @@ cfg_if! { *(dst.offset(1)) = second; *(dst.offset(2)) = third; *(dst.offset(3)) = fourth; - true } #[inline(always)] - unsafe fn basic_latin_to_ascii_stride_little_64(src: *const usize, dst: *mut usize) -> bool { - let first = *src; - let second = *(src.offset(1)); - let third = *(src.offset(2)); - let fourth = *(src.offset(3)); - if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 { - return false; - } + unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) { let word = ((0x00FF0000_00000000usize & second) << 8) | ((0x000000FF_00000000usize & second) << 16) | ((0x00000000_00FF0000usize & second) << 24) | @@ -493,28 +697,18 @@ cfg_if! 
{ (0x00000000_000000FFusize & third); *dst = word; *(dst.offset(1)) = second_word; - true } - - basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_little_64); - basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_little_64); } else if #[cfg(all(target_endian = "little", target_pointer_width = "32"))] { // Aligned ALU word, little-endian, 32-bit pub const STRIDE_SIZE: usize = 8; - const ALIGNMENT: usize = 4; + pub const ALIGNMENT: usize = 4; - const ALIGNMENT_MASK: usize = 3; + pub const ALIGNMENT_MASK: usize = 3; #[inline(always)] - unsafe fn ascii_to_basic_latin_stride_little_32(src: *const usize, dst: *mut usize) -> bool { - let word = *src; - let second_word = *(src.offset(1)); - // Check if the words contains non-ASCII - if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 { - return false; - } + unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) { let first = ((0x0000FF00usize & word) << 8) | (0x000000FFusize & word); let second = ((0xFF000000usize & word) >> 8) | @@ -527,18 +721,10 @@ cfg_if! { *(dst.offset(1)) = second; *(dst.offset(2)) = third; *(dst.offset(3)) = fourth; - return true; } #[inline(always)] - unsafe fn basic_latin_to_ascii_stride_little_32(src: *const usize, dst: *mut usize) -> bool { - let first = *src; - let second = *(src.offset(1)); - let third = *(src.offset(2)); - let fourth = *(src.offset(3)); - if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 { - return false; - } + unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) { let word = ((0x00FF0000usize & second) << 8) | ((0x000000FFusize & second) << 16) | ((0x00FF0000usize & first) >> 8) | @@ -549,28 +735,18 @@ cfg_if! 
{ (0x000000FFusize & third); *dst = word; *(dst.offset(1)) = second_word; - return true; } - - basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_little_32); - basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_little_32); } else if #[cfg(all(target_endian = "big", target_pointer_width = "64"))] { // Aligned ALU word, big-endian, 64-bit pub const STRIDE_SIZE: usize = 16; - const ALIGNMENT: usize = 8; + pub const ALIGNMENT: usize = 8; - const ALIGNMENT_MASK: usize = 7; + pub const ALIGNMENT_MASK: usize = 7; #[inline(always)] - unsafe fn ascii_to_basic_latin_stride_big_64(src: *const usize, dst: *mut usize) -> bool { - let word = *src; - let second_word = *(src.offset(1)); - // Check if the words contains non-ASCII - if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 { - return false; - } + unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) { let first = ((0xFF000000_00000000usize & word) >> 8) | ((0x00FF0000_00000000usize & word) >> 16) | ((0x0000FF00_00000000usize & word) >> 24) | @@ -591,18 +767,10 @@ cfg_if! { *(dst.offset(1)) = second; *(dst.offset(2)) = third; *(dst.offset(3)) = fourth; - return true; } #[inline(always)] - unsafe fn basic_latin_to_ascii_stride_big_64(src: *const usize, dst: *mut usize) -> bool { - let first = *src; - let second = *(src.offset(1)); - let third = *(src.offset(2)); - let fourth = *(src.offset(3)); - if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 { - return false; - } + unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) { let word = ((0x00FF0000_00000000usize & first) << 8) | ((0x000000FF_00000000usize & first) << 16) | ((0x00000000_00FF0000usize & first) << 24) | @@ -621,28 +789,18 @@ cfg_if! 
{ (0x00000000_000000FFusize & fourth); *dst = word; *(dst.offset(1)) = second_word; - return true; } - - basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_big_64); - basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_big_64); } else if #[cfg(all(target_endian = "big", target_pointer_width = "32"))] { // Aligned ALU word, big-endian, 32-bit pub const STRIDE_SIZE: usize = 8; - const ALIGNMENT: usize = 4; + pub const ALIGNMENT: usize = 4; - const ALIGNMENT_MASK: usize = 3; + pub const ALIGNMENT_MASK: usize = 3; #[inline(always)] - unsafe fn ascii_to_basic_latin_stride_big_32(src: *const usize, dst: *mut usize) -> bool { - let word = *src; - let second_word = *(src.offset(1)); - // Check if the words contains non-ASCII - if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 { - return false; - } + unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) { let first = ((0xFF000000usize & word) >> 8) | ((0x00FF0000usize & word) >> 16); let second = ((0x0000FF00usize & word) << 8) | @@ -655,18 +813,10 @@ cfg_if! { *(dst.offset(1)) = second; *(dst.offset(2)) = third; *(dst.offset(3)) = fourth; - return true; } #[inline(always)] - unsafe fn basic_latin_to_ascii_stride_big_32(src: *const usize, dst: *mut usize) -> bool { - let first = *src; - let second = *(src.offset(1)); - let third = *(src.offset(2)); - let fourth = *(src.offset(3)); - if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 { - return false; - } + unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) { let word = ((0x00FF0000usize & first) << 8) | ((0x000000FFusize & first) << 16) | ((0x00FF0000usize & second) >> 8) | @@ -677,11 +827,7 @@ cfg_if! 
{ (0x000000FFusize & fourth); *dst = word; *(dst.offset(1)) = second_word; - return true; } - - basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_big_32); - basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_big_32); } else { ascii_naive!(ascii_to_ascii, u8, u8); ascii_naive!(ascii_to_basic_latin, u8, u16); @@ -716,7 +862,7 @@ cfg_if! { let len_minus_stride = len - STRIDE_SIZE; loop { let simd = unsafe { load16_unaligned(src.offset(offset as isize)) }; - if !is_ascii(simd) { + if !simd_is_ascii(simd) { break; } offset += STRIDE_SIZE; @@ -787,9 +933,51 @@ cfg_if! { None } } else { - // `as` truncates, so works on 32-bit, too. - const ASCII_MASK: usize = 0x80808080_80808080u64 as usize; - const BASIC_LATIN_MASK: usize = 0xFF80FF80_FF80FF80u64 as usize; + #[inline(always)] + unsafe fn unpack_latin1_stride_alu(src: *const usize, dst: *mut usize) { + let word = *src; + let second_word = *(src.offset(1)); + unpack_alu(word, second_word, dst); + } + + #[inline(always)] + unsafe fn pack_latin1_stride_alu(src: *const usize, dst: *mut usize) { + let first = *src; + let second = *(src.offset(1)); + let third = *(src.offset(2)); + let fourth = *(src.offset(3)); + pack_alu(first, second, third, fourth, dst); + } + + #[inline(always)] + unsafe fn ascii_to_basic_latin_stride_alu(src: *const usize, dst: *mut usize) -> bool { + let word = *src; + let second_word = *(src.offset(1)); + // Check if the words contains non-ASCII + if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 { + return false; + } + unpack_alu(word, second_word, dst); + true + } + + #[inline(always)] + unsafe fn basic_latin_to_ascii_stride_alu(src: *const usize, dst: *mut usize) -> bool { + let first = *src; + let second = *(src.offset(1)); + let third = *(src.offset(2)); + let fourth = *(src.offset(3)); + if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 { + return false; + } + 
pack_alu(first, second, third, fourth, dst); + true + } + + basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_alu); + basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_alu); + latin1_alu!(unpack_latin1, u8, u16, unpack_latin1_stride_alu); + latin1_alu!(pack_latin1, u16, u8, pack_latin1_stride_alu); #[inline(always)] unsafe fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option { diff --git a/third_party/rust/encoding_rs/src/gb18030.rs b/third_party/rust/encoding_rs/src/gb18030.rs index f4198e1c298e..2835263d3260 100644 --- a/third_party/rust/encoding_rs/src/gb18030.rs +++ b/third_party/rust/encoding_rs/src/gb18030.rs @@ -394,9 +394,7 @@ fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> { // PUA between Hanzi Levels let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810); if bmp_minus_pua_between_hanzi < 5 { - return Some( - (0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize), - ); + return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize)); } None } @@ -595,6 +593,14 @@ mod tests { // 0xFF decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}"); + decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} ! + decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} ! + decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}"); + decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}"); + decode_gb18030( + b"\xFF\x32\x9A\x33\x00", + "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}", + ); // Four bytes decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}"); @@ -605,7 +611,7 @@ mod tests { decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}"); decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}"); decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} ! 
- + decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}"); } #[test] diff --git a/third_party/rust/encoding_rs/src/lib.rs b/third_party/rust/encoding_rs/src/lib.rs index 2886d68da35d..8d5b5723c55c 100644 --- a/third_party/rust/encoding_rs/src/lib.rs +++ b/third_party/rust/encoding_rs/src/lib.rs @@ -8,7 +8,7 @@ // except according to those terms. #![cfg_attr(feature = "cargo-clippy", allow(doc_markdown, inline_always, new_ret_no_self))] -#![doc(html_root_url = "https://docs.rs/encoding_rs/0.7.1")] +#![doc(html_root_url = "https://docs.rs/encoding_rs/0.7.2")] //! encoding_rs is a Gecko-oriented Free Software / Open Source implementation //! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust. @@ -17,6 +17,10 @@ //! streamability goals are browser-oriented, and that FFI-friendliness is a //! goal. //! +//! Additionally, the `mem` module provides functions that are useful for +//! applications that need to be able to deal with legacy in-memory +//! representations of Unicode. +//! //! # Availability //! //! The code is available under the @@ -491,7 +495,7 @@ //! //! -#![cfg_attr(feature = "simd-accel", feature(cfg_target_feature, platform_intrinsics))] +#![cfg_attr(feature = "simd-accel", feature(cfg_target_feature, platform_intrinsics, core_intrinsics))] #[macro_use] extern crate cfg_if; @@ -539,6 +543,8 @@ mod handles; mod data; mod variant; +pub mod mem; + use variant::*; use utf_8::utf8_valid_up_to; use ascii::ascii_valid_up_to; @@ -2030,20 +2036,20 @@ static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 219] = [&WINDOWS_1252_INIT, /// # Streaming vs. Non-Streaming /// /// When you have the entire input in a single buffer, you can use the -/// methods [`decode()`][1], [`decode_with_bom_removal()`][2], -/// [`decode_without_bom_handling()`][3], -/// [`decode_without_bom_handling_and_without_replacement()`][4] and -/// [`encode()`][5]. 
(These methods are available to Rust callers only and are +/// methods [`decode()`][3], [`decode_with_bom_removal()`][4], +/// [`decode_without_bom_handling()`][5], +/// [`decode_without_bom_handling_and_without_replacement()`][6] and +/// [`encode()`][7]. (These methods are available to Rust callers only and are /// not available in the C API.) Unlike the rest of the API available to Rust, /// these methods perform heap allocations. You should the `Decoder` and /// `Encoder` objects when your input is split into multiple buffers or when /// you want to control the allocation of the output buffers. /// -/// [1]: #method.decode -/// [2]: #method.decode_with_bom_removal -/// [3]: #method.decode_without_bom_handling -/// [4]: #method.decode_without_bom_handling_and_without_replacement -/// [5]: #method.encode +/// [3]: #method.decode +/// [4]: #method.decode_with_bom_removal +/// [5]: #method.decode_without_bom_handling +/// [6]: #method.decode_without_bom_handling_and_without_replacement +/// [7]: #method.encode /// /// # Instances /// @@ -2222,6 +2228,7 @@ impl Encoding { /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`. /// /// Available via the C wrapper. + #[inline] pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> { match Encoding::for_label(label) { None => None, @@ -2246,6 +2253,7 @@ impl Encoding { /// or UTF-16BE BOM or `None` otherwise. /// /// Available via the C wrapper. + #[inline] pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> { if buffer.starts_with(b"\xEF\xBB\xBF") { Some((UTF_8, 3)) @@ -2264,6 +2272,7 @@ impl Encoding { /// `document.characterSet` property. /// /// Available via the C wrapper. + #[inline] pub fn name(&'static self) -> &'static str { self.name } @@ -2272,6 +2281,7 @@ impl Encoding { /// `char`. (Only true if the output encoding is UTF-8.) /// /// Available via the C wrapper.
+ #[inline] pub fn can_encode_everything(&'static self) -> bool { self.output_encoding() == UTF_8 } @@ -2280,12 +2290,14 @@ impl Encoding { /// U+0000...U+007F and vice versa. /// /// Available via the C wrapper. + #[inline] pub fn is_ascii_compatible(&'static self) -> bool { !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP) } /// Checks whether the bytes 0x00...0x7F map mostly to the characters /// U+0000...U+007F and vice versa. + #[inline] fn is_potentially_borrowable(&'static self) -> bool { !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE) } @@ -2294,6 +2306,7 @@ impl Encoding { /// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise. /// /// Available via the C wrapper. + #[inline] pub fn output_encoding(&'static self) -> &'static Encoding { if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE { UTF_8 @@ -2336,6 +2349,7 @@ impl Encoding { /// `usize`. /// /// Available to Rust only. + #[inline] pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) { let (encoding, without_bom) = match Encoding::for_bom(bytes) { Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]), @@ -2378,6 +2392,7 @@ impl Encoding { /// `usize`. /// /// Available to Rust only. + #[inline] pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) { let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") { &bytes[3..] @@ -2689,6 +2704,7 @@ impl Encoding { /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. /// /// Available via the C wrapper. + #[inline] pub fn new_decoder(&'static self) -> Decoder { Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff) } @@ -2702,6 +2718,7 @@ impl Encoding { /// encoding. /// /// Available via the C wrapper. 
+ #[inline] pub fn new_decoder_with_bom_removal(&'static self) -> Decoder { Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove) } @@ -2717,6 +2734,7 @@ impl Encoding { /// instead of this method to cause the BOM to be removed. /// /// Available via the C wrapper. + #[inline] pub fn new_decoder_without_bom_handling(&'static self) -> Decoder { Decoder::new(self, self.new_variant_decoder(), BomHandling::Off) } @@ -2724,6 +2742,7 @@ impl Encoding { /// Instantiates a new encoder for the output encoding of this encoding. /// /// Available via the C wrapper. + #[inline] pub fn new_encoder(&'static self) -> Encoder { let enc = self.output_encoding(); enc.variant.new_encoder(enc) @@ -2767,6 +2786,7 @@ impl Encoding { } impl PartialEq for Encoding { + #[inline] fn eq(&self, other: &Encoding) -> bool { (self as *const Encoding) == (other as *const Encoding) } @@ -2775,12 +2795,14 @@ impl PartialEq for Encoding { impl Eq for Encoding {} impl Hash for Encoding { + #[inline] fn hash(&self, state: &mut H) { (self as *const Encoding).hash(state); } } impl std::fmt::Debug for Encoding { + #[inline] fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "Encoding {{ {} }}", self.name) } @@ -2788,6 +2810,7 @@ impl std::fmt::Debug for Encoding { #[cfg(feature = "serde")] impl Serialize for Encoding { + #[inline] fn serialize(&self, serializer: S) -> Result where S: Serializer { @@ -3054,6 +3077,7 @@ impl Decoder { /// of the decoder. /// /// Available via the C wrapper. + #[inline] pub fn encoding(&self) -> &'static Encoding { self.encoding } @@ -3769,12 +3793,14 @@ impl Encoder { } /// The `Encoding` this `Encoder` is for. + #[inline] pub fn encoding(&self) -> &'static Encoding { self.encoding } /// Returns `true` if this is an ISO-2022-JP encoder that's not in the /// ASCII state and `false` otherwise. 
+ #[inline] pub fn has_pending_state(&self) -> bool { self.variant.has_pending_state() } @@ -4111,6 +4137,16 @@ fn in_range16(i: u16, start: u16, end: u16) -> bool { i.wrapping_sub(start) < (end - start) } +#[inline(always)] +fn in_range32(i: u32, start: u32, end: u32) -> bool { + i.wrapping_sub(start) < (end - start) +} + +#[inline(always)] +fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool { + i.wrapping_sub(start) <= (end - start) +} + #[inline(always)] fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool { i.wrapping_sub(start) <= (end - start) diff --git a/third_party/rust/encoding_rs/src/mem.rs b/third_party/rust/encoding_rs/src/mem.rs new file mode 100644 index 000000000000..b26593855190 --- /dev/null +++ b/third_party/rust/encoding_rs/src/mem.rs @@ -0,0 +1,2873 @@ +// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Functions for converting between different in-RAM representations of text +//! and for quickly checking if the Unicode Bidirectional Algorithm can be +//! avoided. +//! +//! By using slices for output, the functions here seek to enable by-register +//! (ALU register or SIMD register as available) operations in order to +//! outperform iterator-based conversions available in the Rust standard +//! library. +//! +//! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to +//! U+00FF, inclusive, and does not refer to the windows-1252 range. This +//! in-memory encoding is sometimes used as a storage optimization of text +//! when UTF-16 indexing and length semantics are exposed. 
+ +use ascii::*; +use super::in_inclusive_range8; +use super::in_inclusive_range16; +use super::in_range16; +use super::in_inclusive_range32; +use super::in_range32; +use super::DecoderResult; +use super::EncoderResult; +use utf_8::*; + +cfg_if!{ + if #[cfg(feature = "simd-accel")] { + use ::std::intrinsics::unlikely; + } else { + #[inline(always)] + // Unsafe to match the intrinsic, which is needlessly unsafe. + unsafe fn unlikely(b: bool) -> bool { + b + } + } +} + +/// Classification of text as Latin1 (all code points are below U+0100), +/// left-to-right with some non-Latin1 characters or as containing at least +/// some right-to-left characters. +#[must_use] +#[derive(Debug, PartialEq, Eq)] +#[repr(C)] +pub enum Latin1Bidi { + /// Every character is below U+0100. + Latin1 = 0, + /// There is at least one character that's U+0100 or higher, but there + /// are no right-to-left characters. + LeftToRight = 1, + /// There is at least one right-to-left character. + Bidi = 2, +} + +// `as` truncates, so works on 32-bit, too. +#[allow(dead_code)] +const LATIN1_MASK: usize = 0xFF00FF00_FF00FF00u64 as usize; + +#[allow(unused_macros)] +macro_rules! by_unit_check_alu { + ($name:ident, + $unit:ty, + $bound:expr, + $mask:ident) => ( + #[inline(always)] + fn $name(buffer: &[$unit]) -> bool { + let mut offset = 0usize; + let mut accu = 0usize; + let unit_size = ::std::mem::size_of::<$unit>(); + let len = buffer.len(); + if len >= ALIGNMENT / unit_size { + // The most common reason to return `false` is for the first code + // unit to fail the test, so check that first. 
+ if buffer[0] >= $bound { + return false; + } + let src = buffer.as_ptr(); + let mut until_alignment = ((ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & + ALIGNMENT_MASK) / unit_size; + if until_alignment + ALIGNMENT / unit_size <= len { + if until_alignment != 0 { + accu |= buffer[offset] as usize; + offset += 1; + until_alignment -= 1; + while until_alignment != 0 { + accu |= buffer[offset] as usize; + offset += 1; + until_alignment -= 1; + } + if accu >= $bound { + return false; + } + } + let len_minus_stride = len - ALIGNMENT / unit_size; + if offset + (4 * (ALIGNMENT / unit_size)) <= len { + let len_minus_unroll = len - (4 * (ALIGNMENT / unit_size)); + loop { + let unroll_accu = unsafe { *(src.offset(offset as isize) as *const usize) } | + unsafe { *(src.offset((offset + (ALIGNMENT / unit_size)) as isize) as *const usize) } | + unsafe { *(src.offset((offset + (2 * (ALIGNMENT / unit_size))) as isize) as *const usize) } | + unsafe { *(src.offset((offset + (3 * (ALIGNMENT / unit_size))) as isize) as *const usize) }; + if unroll_accu & $mask != 0 { + return false; + } + offset += 4 * (ALIGNMENT / unit_size); + if offset > len_minus_unroll { + break; + } + } + } + while offset <= len_minus_stride { + accu |= unsafe { *(src.offset(offset as isize) as *const usize) }; + offset += ALIGNMENT / unit_size; + } + } + } + for &unit in &buffer[offset..] { + accu |= unit as usize; + } + accu & $mask == 0 + }) +} + +#[allow(unused_macros)] +macro_rules! by_unit_check_simd { + ($name:ident, + $unit:ty, + $splat:expr, + $simd_ty:ty, + $bound:expr, + $func:ident) => ( + #[inline(always)] + fn $name(buffer: &[$unit]) -> bool { + let mut offset = 0usize; + let mut accu = 0usize; + let unit_size = ::std::mem::size_of::<$unit>(); + let len = buffer.len(); + if len >= STRIDE_SIZE / unit_size { + // The most common reason to return `false` is for the first code + // unit to fail the test, so check that first. 
+ if buffer[0] >= $bound { + return false; + } + let src = buffer.as_ptr(); + let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & + SIMD_ALIGNMENT_MASK) / unit_size; + if until_alignment + STRIDE_SIZE / unit_size <= len { + if until_alignment != 0 { + accu |= buffer[offset] as usize; + offset += 1; + until_alignment -= 1; + while until_alignment != 0 { + accu |= buffer[offset] as usize; + offset += 1; + until_alignment -= 1; + } + if accu >= $bound { + return false; + } + } + let len_minus_stride = len - STRIDE_SIZE / unit_size; + if offset + (4 * (STRIDE_SIZE / unit_size)) <= len { + let len_minus_unroll = len - (4 * (STRIDE_SIZE / unit_size)); + loop { + let unroll_accu = unsafe { *(src.offset(offset as isize) as *const $simd_ty) } | + unsafe { *(src.offset((offset + (STRIDE_SIZE / unit_size)) as isize) as *const $simd_ty) } | + unsafe { *(src.offset((offset + (2 * (STRIDE_SIZE / unit_size))) as isize) as *const $simd_ty) } | + unsafe { *(src.offset((offset + (3 * (STRIDE_SIZE / unit_size))) as isize) as *const $simd_ty) }; + if !$func(unroll_accu) { + return false; + } + offset += 4 * (STRIDE_SIZE / unit_size); + if offset > len_minus_unroll { + break; + } + } + } + let mut simd_accu = $splat; + while offset <= len_minus_stride { + simd_accu = simd_accu | unsafe { *(src.offset(offset as isize) as *const $simd_ty) }; + offset += STRIDE_SIZE / unit_size; + } + if !$func(simd_accu) { + return false; + } + } + } + for &unit in &buffer[offset..] 
{ + accu |= unit as usize; + } + accu < $bound + }) +} + +cfg_if!{ + if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"))))] { + use simd_funcs::*; + use simd::u8x16; + use simd::u16x8; + + const SIMD_ALIGNMENT: usize = 16; + + const SIMD_ALIGNMENT_MASK: usize = 15; + + by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii); + by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin); + by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1); + + #[inline(always)] + fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize { + // This function is a mess, because it simultaneously tries to do + // only aligned SIMD (perhaps misguidedly) and needs to deal with + // the last code unit in a SIMD stride being part of a valid + // surrogate pair. + let unit_size = ::std::mem::size_of::(); + let src = buffer.as_ptr(); + let len = buffer.len(); + let mut offset = 0usize; + 'outer: loop { + let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.offset(offset as isize) } as usize) & SIMD_ALIGNMENT_MASK)) & + SIMD_ALIGNMENT_MASK) / unit_size; + if until_alignment == 0 { + if offset + STRIDE_SIZE / unit_size > len { + break; + } + } else { + let offset_plus_until_alignment = offset + until_alignment; + let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1; + if offset_plus_until_alignment_plus_one + STRIDE_SIZE / unit_size > len { + break; + } + let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]); + if up_to < until_alignment { + return offset + up_to; + } + if last_valid_low { + offset = offset_plus_until_alignment_plus_one; + continue; + } + offset = offset_plus_until_alignment; + } + let len_minus_stride = len - STRIDE_SIZE / unit_size; + 'inner: loop { + let offset_plus_stride = offset + STRIDE_SIZE / unit_size; + if 
contains_surrogates(unsafe { *(src.offset(offset as isize) as *const u16x8) }) { + if offset_plus_stride == len { + break 'outer; + } + let offset_plus_stride_plus_one = offset_plus_stride + 1; + let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]); + if up_to < STRIDE_SIZE / unit_size { + return offset + up_to; + } + if last_valid_low { + offset = offset_plus_stride_plus_one; + continue 'outer; + } + } + offset = offset_plus_stride; + if offset > len_minus_stride { + break 'outer; + } + } + } + let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]); + offset + up_to + } + } else { + by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK); + by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK); + by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK); + + #[inline(always)] + fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize { + let (up_to, _) = utf16_valid_up_to_alu(buffer); + up_to + } + } +} + +/// The second return value is true iff the last code unit of the slice was +/// reached and turned out to be a low surrogate that is part of a valid pair. +#[inline(always)] +fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) { + let len = buffer.len(); + if len == 0 { + return (0, false); + } + let mut offset = 0usize; + loop { + let unit = buffer[offset]; + let next = offset + 1; + let unit_minus_surrogate_start = unit.wrapping_sub(0xD800); + if unit_minus_surrogate_start > (0xDFFF - 0xD800) { + // Not a surrogate + offset = next; + if offset == len { + return (offset, false); + } + continue; + } + if unit_minus_surrogate_start <= (0xDBFF - 0xD800) { + // high surrogate + if next < len { + let second = buffer[next]; + let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00); + if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) { + // The next code unit is a low surrogate. Advance position. 
+ offset = next + 1; + if offset == len { + return (offset, true); + } + continue; + } + // The next code unit is not a low surrogate. Don't advance + // position and treat the high surrogate as unpaired. + // fall through + } + // Unpaired, fall through + } + // Unpaired surrogate + return (offset, false); + } +} + +cfg_if!{ + if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"))))] { + #[inline(always)] + fn is_str_latin1_impl(buffer: &str) -> Option { + let mut offset = 0usize; + let bytes = buffer.as_bytes(); + let len = bytes.len(); + if len >= STRIDE_SIZE { + let src = bytes.as_ptr(); + let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & + SIMD_ALIGNMENT_MASK; + if until_alignment + STRIDE_SIZE <= len { + while until_alignment != 0 { + if bytes[offset] > 0xC3 { + return Some(offset); + } + offset += 1; + until_alignment -= 1; + } + let len_minus_stride = len - STRIDE_SIZE; + loop { + if !simd_is_str_latin1(unsafe { *(src.offset(offset as isize) as *const u8x16) }) { + // TODO: Ensure this compiles away when inlined into `is_str_latin1()`. 
+ while bytes[offset] & 0xC0 == 0x80 { + offset += 1; + } + return Some(offset); + } + offset += STRIDE_SIZE; + if offset > len_minus_stride { + break; + } + } + } + } + for i in offset..len { + if bytes[i] > 0xC3 { + return Some(i); + } + } + None + } + } else { + #[inline(always)] + fn is_str_latin1_impl(buffer: &str) -> Option { + let mut bytes = buffer.as_bytes(); + let mut total = 0; + loop { + if let Some((byte, offset)) = validate_ascii(bytes) { + total += offset; + if byte > 0xC3 { + return Some(total); + } + bytes = &bytes[offset + 2..]; + total += 2; + } else { + return None; + } + } + } + } +} + +#[inline(always)] +fn is_utf8_latin1_impl(buffer: &[u8]) -> Option { + let mut bytes = buffer; + let mut total = 0; + loop { + if let Some((byte, offset)) = validate_ascii(bytes) { + total += offset; + if in_inclusive_range8(byte, 0xC2, 0xC3) { + let next = offset + 1; + if next == bytes.len() { + return Some(total); + } + if bytes[next] & 0xC0 != 0x80 { + return Some(total); + } + bytes = &bytes[offset + 2..]; + total += 2; + } else { + return Some(total); + } + } else { + return None; + } + } +} + +cfg_if!{ + if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"))))] { + #[inline(always)] + fn is_utf16_bidi_impl(buffer: &[u16]) -> bool { + let mut offset = 0usize; + let len = buffer.len(); + if len >= STRIDE_SIZE / 2 { + let src = buffer.as_ptr(); + let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & + SIMD_ALIGNMENT_MASK) / 2; + if until_alignment + (STRIDE_SIZE / 2) <= len { + while until_alignment != 0 { + if is_utf16_code_unit_bidi(buffer[offset]) { + return true; + } + offset += 1; + until_alignment -= 1; + } + let len_minus_stride = len - (STRIDE_SIZE / 2); + loop { + if is_u16x8_bidi(unsafe { *(src.offset(offset as isize) as *const u16x8) }) { + return true; + } + offset += STRIDE_SIZE / 2; + if offset > len_minus_stride { + break; + } + } + } + } + for 
&u in &buffer[offset..] { + if is_utf16_code_unit_bidi(u) { + return true; + } + } + false + } + } else { + #[inline(always)] + fn is_utf16_bidi_impl(buffer: &[u16]) -> bool { + for &u in buffer { + if is_utf16_code_unit_bidi(u) { + return true; + } + } + false + } + } +} + +cfg_if!{ + if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"))))] { + #[inline(always)] + fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi { + let mut offset = 0usize; + let len = buffer.len(); + if len >= STRIDE_SIZE / 2 { + let src = buffer.as_ptr(); + let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & + SIMD_ALIGNMENT_MASK) / 2; + if until_alignment + (STRIDE_SIZE / 2) <= len { + while until_alignment != 0 { + if buffer[offset] > 0xFF { + // This transition isn't optimal, since the alignment is recomputed, + // but not tweaking further today. + if is_utf16_bidi_impl(&buffer[offset..]) { + return Latin1Bidi::Bidi; + } + return Latin1Bidi::LeftToRight; + } + offset += 1; + until_alignment -= 1; + } + let len_minus_stride = len - (STRIDE_SIZE / 2); + loop { + let mut s = unsafe { *(src.offset(offset as isize) as *const u16x8) }; + if !simd_is_latin1(s) { + loop { + if is_u16x8_bidi(s) { + return Latin1Bidi::Bidi; + } + offset += STRIDE_SIZE / 2; + if offset > len_minus_stride { + for &u in &buffer[offset..]
{ + if is_utf16_code_unit_bidi(u) { + return Latin1Bidi::Bidi; + } + } + return Latin1Bidi::LeftToRight; + } + s = unsafe { *(src.offset(offset as isize) as *const u16x8) }; + } + } + offset += STRIDE_SIZE / 2; + if offset > len_minus_stride { + break; + } + } + } + } + let mut iter = (&buffer[offset..]).iter(); + loop { + if let Some(&u) = iter.next() { + if u > 0xFF { + let mut inner_u = u; + loop { + if is_utf16_code_unit_bidi(inner_u) { + return Latin1Bidi::Bidi; + } + if let Some(&code_unit) = iter.next() { + inner_u = code_unit; + } else { + return Latin1Bidi::LeftToRight; + } + } + } + } else { + return Latin1Bidi::Latin1; + } + } + } + } else { + #[inline(always)] + fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi { + let mut offset = 0usize; + let len = buffer.len(); + if len >= ALIGNMENT / 2 { + let src = buffer.as_ptr(); + let mut until_alignment = ((ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & + ALIGNMENT_MASK) / 2; + if until_alignment + ALIGNMENT / 2 <= len { + while until_alignment != 0 { + if buffer[offset] > 0xFF { + if is_utf16_bidi_impl(&buffer[offset..]) { + return Latin1Bidi::Bidi; + } + return Latin1Bidi::LeftToRight; + } + offset += 1; + until_alignment -= 1; + } + let len_minus_stride = len - ALIGNMENT / 2; + loop { + if unsafe { *(src.offset(offset as isize) as *const usize) } & LATIN1_MASK != 0 { + if is_utf16_bidi_impl(&buffer[offset..]) { + return Latin1Bidi::Bidi; + } + return Latin1Bidi::LeftToRight; + } + offset += ALIGNMENT / 2; + if offset > len_minus_stride { + break; + } + } + } + } + let mut iter = (&buffer[offset..]).iter(); + loop { + if let Some(&u) = iter.next() { + if u > 0xFF { + let mut inner_u = u; + loop { + if is_utf16_code_unit_bidi(inner_u) { + return Latin1Bidi::Bidi; + } + if let Some(&code_unit) = iter.next() { + inner_u = code_unit; + } else { + return Latin1Bidi::LeftToRight; + } + } + } + } else { + return Latin1Bidi::Latin1; + } + } + } + } +} + +/// Checks whether the buffer is 
all-ASCII. +/// +/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function +/// is not guaranteed to fail fast.) +#[inline] +pub fn is_ascii(buffer: &[u8]) -> bool { + is_ascii_impl(buffer) +} + +/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing +/// only ASCII characters). +/// +/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function +/// is not guaranteed to fail fast.) +#[inline] +pub fn is_basic_latin(buffer: &[u16]) -> bool { + is_basic_latin_impl(buffer) +} + +/// Checks whether the buffer is valid UTF-8 representing only code points +/// less than or equal to U+00FF. +/// +/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8 +/// invalidity or code points above U+00FF are discovered. +#[inline] +pub fn is_utf8_latin1(buffer: &[u8]) -> bool { + is_utf8_latin1_impl(buffer).is_none() +} + +/// Checks whether the buffer represents only code point less than or equal +/// to U+00FF. +/// +/// Fails fast. (I.e. returns before having read the whole buffer if code +/// points above U+00FF are discovered. +#[inline] +pub fn is_str_latin1(buffer: &str) -> bool { + is_str_latin1_impl(buffer).is_none() +} + +/// Checks whether the buffer represents only code point less than or equal +/// to U+00FF. +/// +/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function +/// is not guaranteed to fail fast.) +#[inline] +pub fn is_utf16_latin1(buffer: &[u16]) -> bool { + is_utf16_latin1_impl(buffer) +} + +/// Checks whether a potentially-invalid UTF-8 buffer contains code points +/// that trigger right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. Additionally, the four +/// RIGHT-TO-LEFT FOO controls in General Punctuation are checked for. 
/// Control characters that are technically bidi controls but do not cause
/// right-to-left behavior without the presence of right-to-left characters
/// or right-to-left controls are not checked for.
///
/// Returns `true` if the input is invalid UTF-8 or the input contains an
/// RTL character. Returns `false` if the input is valid UTF-8 and contains
/// no RTL characters.
///
/// Invalid input is reported as `true` because every validation failure
/// in the body takes the same `return true` path as an RTL hit.
#[inline]
pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
    // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
    // than UTF-8 validation followed by `is_str_bidi()` for German,
    // Russian and Japanese. However, this is considerably slower for Thai.
    // Chances are that the compiler makes some branch predictions that are
    // unfortunate for Thai. Not spending the time to manually optimize
    // further at this time, since it's unclear if this variant even has
    // use cases. However, this is worth revisiting once Rust gets the
    // ability to annotate relative priorities of match arms.

    // Byte sequences at the edges of the ranges tested below:
    //
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB4F: EF AD 8F
    // U+FB50: EF AD 90
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFF: EF BB BF
    // U+FF00: EF BC 80
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80

    // `bytes` is the not-yet-examined tail of `buffer`; it is re-sliced every
    // time the scan hands control back to the ASCII fast path.
    let mut bytes = buffer;
    'outer: loop {
        // NOTE(review): `validate_ascii` is defined elsewhere. From its use
        // here it appears to return `None` when the rest is all ASCII and
        // otherwise the first non-ASCII byte together with its index
        // (`bytes[read] == byte`) -- confirm against its definition.
        if let Some((mut byte, mut read)) = validate_ascii(bytes) {
            // Check for the longest sequence to avoid checking twice for the
            // multi-byte sequences.
            if read + 4 <= bytes.len() {
                // Fast loop: at least four bytes are available starting at
                // `read`, so `bytes[read + 1..=read + 3]` cannot go out of
                // bounds whatever the lead byte claims.
                'inner: loop {
                    // At this point, `byte` is not included in `read`.
                    match byte {
                        0...0x7F => {
                            // ASCII: go back to SIMD.
                            read += 1;
                            bytes = &bytes[read..];
                            continue 'outer;
                        }
                        0xC2...0xD5 => {
                            // Two-byte
                            let second = bytes[read + 1];
                            // A non-zero masked table entry is treated as an
                            // invalid trail byte for this class of lead.
                            if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
                                return true;
                            }
                            read += 2;
                        }
                        0xD6 => {
                            // Two-byte
                            let second = bytes[read + 1];
                            if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
                                return true;
                            }
                            // XXX consider folding the above and below checks
                            // D6 90 is U+0590, the first code point treated
                            // as RTL.
                            if second > 0x8F {
                                return true;
                            }
                            read += 2;
                        }
                        // two-byte starting with 0xD7 and above is bidi
                        0xE1 | 0xE3...0xEC | 0xEE => {
                            // Three-byte normal
                            let second = bytes[read + 1];
                            let third = bytes[read + 2];
                            if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) |
                                (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)) != 0
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xE2 => {
                            // Three-byte normal, potentially bidi
                            let second = bytes[read + 1];
                            let third = bytes[read + 2];
                            if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) |
                                (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)) != 0
                            {
                                return true;
                            }
                            // The four RTL controls: U+200F, U+202B, U+202E
                            // (E2 80 xx) and U+2067 (E2 81 A7).
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xEF => {
                            // Three-byte normal, potentially bidi
                            let second = bytes[read + 1];
                            let third = bytes[read + 2];
                            if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) |
                                (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)) != 0
                            {
                                return true;
                            }
                            // U+FB50..=U+FDFF is EF AD 90 ..= EF B7 BF.
                            if in_inclusive_range8(second, 0xAD, 0xB7) {
                                if second == 0xAD {
                                    if third > 0x8F {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            // U+FE70..=U+FEFF is EF B9 B0 ..= EF BB BF.
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                if second == 0xB9 {
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xE0 => {
                            // Three-byte special lower bound, potentially bidi
                            let second = bytes[read + 1];
                            let third = bytes[read + 2];
                            if ((UTF8_TRAIL_INVALID[second as usize] &
                                 UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL) |
                                (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)) != 0
                            {
                                return true;
                            }
                            // XXX can this be folded into the above validity check
                            // E0 A4 80 is U+0900; everything encoded below
                            // it with this lead is at most U+08FF, i.e. RTL.
                            if second < 0xA4 {
                                return true;
                            }
                            read += 3;
                        }
                        0xED => {
                            // Three-byte special upper bound
                            let second = bytes[read + 1];
                            let third = bytes[read + 2];
                            if ((UTF8_TRAIL_INVALID[second as usize] &
                                 UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL) |
                                (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)) != 0
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xF1...0xF3 => {
                            // Four-byte normal
                            let second = bytes[read + 1];
                            let third = bytes[read + 2];
                            let fourth = bytes[read + 3];
                            if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) |
                                (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL) |
                                (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL)) != 0
                            {
                                return true;
                            }
                            read += 4;
                        }
                        0xF0 => {
                            // Four-byte special lower bound, potentially bidi
                            let second = bytes[read + 1];
                            let third = bytes[read + 2];
                            let fourth = bytes[read + 3];
                            if ((UTF8_TRAIL_INVALID[second as usize] &
                                 UTF8_FOUR_BYTE_SPECIAL_LOWER_BOUND_TRAIL) |
                                (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL) |
                                (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL)) != 0
                            {
                                return true;
                            }
                            // Per the table above, the astral RTL ranges are
                            // F0 90 A0 80 ..= F0 90 BF BF and
                            // F0 9E A0 80 ..= F0 9E BF BF.
                            // NOTE(review): `unlikely` is presumably the
                            // branch-prediction intrinsic -- confirm.
                            if unsafe { unlikely(second == 0x90 || second == 0x9E) } {
                                // Re-reads the same byte as `third` above.
                                let third = bytes[read + 2];
                                if third >= 0xA0 {
                                    return true;
                                }
                            }
                            read += 4;
                        }
                        0xF4 => {
                            // Four-byte special upper bound
                            let second = bytes[read + 1];
                            let third = bytes[read + 2];
                            let fourth = bytes[read + 3];
                            if ((UTF8_TRAIL_INVALID[second as usize] &
                                 UTF8_FOUR_BYTE_SPECIAL_UPPER_BOUND_TRAIL) |
                                (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL) |
                                (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL)) != 0
                            {
                                return true;
                            }
                            read += 4;
                        }
                        _ => {
                            // Invalid lead or bidi-only lead
                            return true;
                        }
                    }
                    if read + 4 > bytes.len() {
                        // Fewer than four bytes remain: either the input is
                        // exhausted, or the tail handling below takes over
                        // with `byte` already loaded.
                        if read == bytes.len() {
                            return false;
                        }
                        byte = bytes[read];
                        break 'inner;
                    }
                    byte = bytes[read];
                    continue 'inner;
                }
            }
            // We can't have a complete 4-byte sequence, but we could still have
            // a complete shorter sequence.

            // At this point, `byte` is not included in `read`.
            match byte {
                0...0x7F => {
                    // ASCII: go back to SIMD.
                    read += 1;
                    bytes = &bytes[read..];
                    continue 'outer;
                }
                0xC2...0xD5 => {
                    // Two-byte
                    let new_read = read + 2;
                    // A truncated sequence is invalid UTF-8.
                    if new_read > bytes.len() {
                        return true;
                    }
                    let second = bytes[read + 1];
                    if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    bytes = &bytes[read..];
                    continue 'outer;
                }
                0xD6 => {
                    // Two-byte, potentially bidi
                    let new_read = read + 2;
                    if new_read > bytes.len() {
                        return true;
                    }
                    let second = bytes[read + 1];
                    if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
                        return true;
                    }
                    // XXX consider folding the above and below checks
                    if second > 0x8F {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    bytes = &bytes[read..];
                    continue 'outer;
                }
                // two-byte starting with 0xD7 and above is bidi
                //
                // The three-byte arms below do not advance `read`: at most
                // three bytes remain here (`read + 4 > bytes.len()`), so a
                // complete three-byte sequence ends exactly at the end of the
                // input and the `return false` after the match is the answer.
                0xE1 | 0xE3...0xEC | 0xEE => {
                    // Three-byte normal
                    let new_read = read + 3;
                    if new_read > bytes.len() {
                        return true;
                    }
                    let second = bytes[read + 1];
                    let third = bytes[read + 2];
                    if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) |
                        (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)) != 0
                    {
                        return true;
                    }
                }
                0xE2 => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > bytes.len() {
                        return true;
                    }
                    let second = bytes[read + 1];
                    let third = bytes[read + 2];
                    if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) |
                        (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)) != 0
                    {
                        return true;
                    }
                    if second == 0x80 {
                        if third == 0x8F || third == 0xAB || third == 0xAE {
                            return true;
                        }
                    } else if second == 0x81 {
                        if third == 0xA7 {
                            return true;
                        }
                    }
                }
                0xEF => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > bytes.len() {
                        return true;
                    }
                    let second = bytes[read + 1];
                    let third = bytes[read + 2];
                    if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) |
                        (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)) != 0
                    {
                        return true;
                    }
                    if in_inclusive_range8(second, 0xAD, 0xB7) {
                        if second == 0xAD {
                            if third > 0x8F {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                        if second == 0xB9 {
                            if third > 0xAF {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    }
                }
                0xE0 => {
                    // Three-byte special lower bound, potentially bidi
                    let new_read = read + 3;
                    if new_read > bytes.len() {
                        return true;
                    }
                    let second = bytes[read + 1];
                    let third = bytes[read + 2];
                    if ((UTF8_TRAIL_INVALID[second as usize] &
                         UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL) |
                        (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)) != 0
                    {
                        return true;
                    }
                    // XXX can this be folded into the above validity check
                    if second < 0xA4 {
                        return true;
                    }
                }
                0xED => {
                    // Three-byte special upper bound
                    let new_read = read + 3;
                    if new_read > bytes.len() {
                        return true;
                    }
                    let second = bytes[read + 1];
                    let third = bytes[read + 2];
                    if ((UTF8_TRAIL_INVALID[second as usize] &
                         UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL) |
                        (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)) != 0
                    {
                        return true;
                    }
                }
                _ => {
                    // Invalid lead, 4-byte lead or 2-byte bidi-only lead
                    return true;
                }
            }
            return false;
        } else {
            // No non-ASCII byte left, so nothing can be RTL or invalid.
            return false;
        }
    }
}

/// Checks whether a valid UTF-8 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Additionally, the four
/// RIGHT-TO-LEFT FOO controls in General Punctuation are checked for.
/// Control characters that are technically bidi controls but do not cause
/// right-to-left behavior without the presence of right-to-left characters
/// or right-to-left controls are not checked for.
#[inline]
pub fn is_str_bidi(buffer: &str) -> bool {
    // Byte sequences at the edges of the ranges tested below:
    //
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB4F: EF AD 8F
    // U+FB50: EF AD 90
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFF: EF BB BF
    // U+FF00: EF BC 80
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80

    // The input is a `&str`, so no validity checks are performed below: the
    // lead byte alone decides how many bytes are skipped.
    let mut bytes = buffer.as_bytes();
    'outer: loop {
        // TODO: Instead of just validating ASCII using SIMD, use SIMD
        // to check for non-ASCII lead bytes, too, to quickly conclude
        // that the vector consist entirely of CJK and below-Hebrew
        // code points.
        // Unfortunately, scripts above Arabic but below CJK share
        // lead bytes with RTL.
        //
        // NOTE(review): `validate_ascii` is defined elsewhere. From its use
        // here it appears to return `None` when the rest is all ASCII and
        // otherwise the first non-ASCII byte together with its index
        // -- confirm against its definition.
        if let Some((mut byte, mut read)) = validate_ascii(bytes) {
            'inner: loop {
                // At this point, `byte` is not included in `read`.
                if byte < 0xE0 {
                    if byte >= 0x80 {
                        // Two-byte
                        // Adding `unlikely` here improved throughput on
                        // Russian plain text by 33%!
                        if unsafe { unlikely(byte >= 0xD6) } {
                            if byte == 0xD6 {
                                // D6 90 is U+0590, the first RTL code point.
                                let second = bytes[read + 1];
                                if second > 0x8F {
                                    return true;
                                }
                            } else {
                                // Leads 0xD7 and above only encode code
                                // points inside the RTL range.
                                return true;
                            }
                        }
                        read += 2;
                    } else {
                        // ASCII: go back to SIMD.
                        read += 1;
                        // Intuitively, we should go back to the outer loop only
                        // if byte is 0x30 or above, so as to avoid trashing on
                        // ASCII space, comma and period in non-Latin context.
                        // However, the extra branch seems to cost more than it's
                        // worth.
                        bytes = &bytes[read..];
                        continue 'outer;
                    }
                } else if byte < 0xF0 {
                    // Three-byte
                    // Only the leads 0xE0, 0xE2 and 0xEF can start an RTL
                    // sequence; everything else is skipped without looking
                    // at the trail bytes.
                    if unsafe { unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) } {
                        let second = bytes[read + 1];
                        if byte == 0xE0 {
                            // E0 A4 80 is U+0900, the first code point above
                            // the RTL range that ends at U+08FF.
                            if second < 0xA4 {
                                return true;
                            }
                        } else if byte == 0xE2 {
                            // The four RTL controls in General Punctuation.
                            let third = bytes[read + 2];
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                        } else {
                            debug_assert_eq!(byte, 0xEF);
                            // U+FB50..=U+FDFF is EF AD 90 ..= EF B7 BF.
                            if in_inclusive_range8(second, 0xAD, 0xB7) {
                                if second == 0xAD {
                                    let third = bytes[read + 2];
                                    if third > 0x8F {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            // U+FE70..=U+FEFF is EF B9 B0 ..= EF BB BF.
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                if second == 0xB9 {
                                    let third = bytes[read + 2];
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                        }
                    }
                    read += 3;
                } else {
                    // Four-byte
                    let second = bytes[read + 1];
                    // Astral RTL is F0 90 A0 80 ..= F0 90 BF BF and
                    // F0 9E A0 80 ..= F0 9E BF BF (see the table above).
                    if unsafe { unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) } {
                        let third = bytes[read + 2];
                        if third >= 0xA0 {
                            return true;
                        }
                    }
                    read += 4;
                }
                // The comparison is always < or == and never >, but including
                // > here to let the compiler assume that < is true if this
                // comparison is false.
                if read >= bytes.len() {
                    return false;
                }
                byte = bytes[read];
                continue 'inner;
            }
        } else {
            // Only ASCII remained, so there is nothing RTL left to find.
            return false;
        }
    }
}

/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Additionally, the four
/// RIGHT-TO-LEFT FOO controls in General Punctuation are checked for.
/// Control characters that are technically bidi controls but do not cause
/// right-to-left behavior without the presence of right-to-left characters
/// or right-to-left controls are not checked for.
+/// +/// Returns `true` if the input contains an RTL character or an unpaired +/// high surrogate that could be the high half of an RTL character. +/// Returns `false` if teh input contains neither RTL characters nor +/// unpaired high surrogates that could be higher halves of RTL characters. +#[inline] +pub fn is_utf16_bidi(buffer: &[u16]) -> bool { + is_utf16_bidi_impl(buffer) +} + +/// Checks whether a code point triggers right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. Additionally, the four +/// RIGHT-TO-LEFT FOO controls in General Punctuation are checked for. +/// Control characters that are technically bidi controls but do not cause +/// right-to-left behavior without the presence of right-to-left characters +/// or right-to-left controls are not checked for. +#[inline(always)] +pub fn is_char_bidi(c: char) -> bool { + // Controls: + // Every control with RIGHT-TO-LEFT in its name in + // https://www.unicode.org/charts/PDF/U2000.pdf + // U+200F RLM + // U+202B RLE + // U+202E RLO + // U+2067 RLI + // + // BMP RTL: + // https://www.unicode.org/roadmaps/bmp/ + // U+0590...U+08FF + // U+FB50...U+FDFF Arabic Presentation Forms A + // U+FE70...U+FEFF Arabic Presentation Forms B + // + // Supplementary RTL: + // https://www.unicode.org/roadmaps/smp/ + // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803) + // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B) + let code_point = c as u32; + if code_point < 0x0590 { + // Below Hebrew + return false; + } + if in_range32(code_point, 0x0900, 0xFB50) { + // Above Arabic Extended-A and below Arabic Presentation Forms + if in_inclusive_range32(code_point, 0x200F, 0x2067) { + // In the range that contains the RTL controls + return code_point == 0x200F || code_point == 0x202B || code_point == 0x202E || + code_point == 0x2067; + } + return false; + } + if code_point > 0x1EFFF { + // Above second astral RTL. 
(Emoji is here.) + return false; + } + if in_range32(code_point, 0x11000, 0x1E800) { + // Between astral RTL blocks + return false; + } + if in_range32(code_point, 0xFF00, 0x10800) { + // Above Arabic Presentations Forms B and below first + // astral RTL + return false; + } + if in_range32(code_point, 0xFE00, 0xFE70) { + // Between Arabic Presentations Forms + return false; + } + true +} + +/// Checks whether a UTF-16 code unit triggers right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. Additionally, the four +/// RIGHT-TO-LEFT FOO controls in General Punctuation are checked for. +/// Control characters that are technically bidi controls but do not cause +/// right-to-left behavior without the presence of right-to-left characters +/// or right-to-left controls are not checked for. +/// +/// Since supplementary-plane right-to-left blocks are identifiable from the +/// high surrogate without examining the low surrogate, this function returns +/// `true` for such high surrogates making the function suitable for handling +/// supplementary-plane text without decoding surrogate pairs to scalar +/// values. Obviously, such high surrogates are then reported as right-to-left +/// even if actually unpaired. 
+#[inline(always)] +pub fn is_utf16_code_unit_bidi(u: u16) -> bool { + if u < 0x0590 { + // Below Hebrew + return false; + } + if in_range16(u, 0x0900, 0xD802) { + // Above Arabic Extended-A and below first RTL surrogate + if in_inclusive_range16(u, 0x200F, 0x2067) { + // In the range that contains the RTL controls + return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067; + } + return false; + } + if in_range16(u, 0xD83C, 0xFB50) { + // Between astral RTL high surrogates and Arabic Presentation Forms + // (Emoji is here) + return false; + } + if in_range16(u, 0xD804, 0xD83A) { + // Between RTL high surragates + return false; + } + if u > 0xFEFF { + // Above Arabic Presentation Forms + return false; + } + if in_range16(u, 0xFE00, 0xFE70) { + // Between Arabic Presentations Forms + return false; + } + true +} + +/// Checks whether a potentially invalid UTF-8 buffer contains code points +/// that trigger right-to-left processing or is all-Latin1. +/// +/// Possibly more efficient than performing the checks separately. +/// +/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`. +/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return +/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. +#[inline] +pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi { + if let Some(offset) = is_utf8_latin1_impl(buffer) { + if is_utf8_bidi(&buffer[offset..]) { + Latin1Bidi::Bidi + } else { + Latin1Bidi::LeftToRight + } + } else { + Latin1Bidi::Latin1 + } +} + +/// Checks whether a valid UTF-8 buffer contains code points +/// that trigger right-to-left processing or is all-Latin1. +/// +/// Possibly more efficient than performing the checks separately. +/// +/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`. +/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return +/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. 
#[inline]
pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
    // The transition from the latin1 check to the bidi check isn't
    // optimal but not tweaking it to perfection today.
    //
    // `Some(offset)` means the Latin1 scan stopped at `offset`; only the
    // remainder is handed to the bidi check.
    if let Some(offset) = is_str_latin1_impl(buffer) {
        if is_str_bidi(&buffer[offset..]) {
            Latin1Bidi::Bidi
        } else {
            Latin1Bidi::LeftToRight
        }
    } else {
        Latin1Bidi::Latin1
    }
}

/// Checks whether a potentially invalid UTF-16 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
#[inline]
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
    // The implementation is selected at compile time (SIMD vs. ALU).
    check_utf16_for_latin1_and_bidi_impl(buffer)
}

/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
/// with the REPLACEMENT CHARACTER.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer _plus one_.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
#[inline]
pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
    // TODO: Can the + 1 be eliminated?
    assert!(dst.len() >= src.len() + 1);
    let mut decoder = Utf8Decoder::new_inner();
    let mut total_read = 0usize;
    let mut total_written = 0usize;
    // Each pass resumes the decoder on the unread input and the unwritten
    // output; the decoder returns control whenever it hits a malformed
    // sequence so that this loop can emit the replacement character.
    loop {
        let (result, read, written) =
            decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
        total_read += read;
        total_written += written;
        match result {
            DecoderResult::InputEmpty => {
                return total_written;
            }
            DecoderResult::OutputFull => {
                unreachable!("The assert at the top of the function should have caught this.");
            }
            DecoderResult::Malformed(_, _) => {
                // There should always be space for the U+FFFD, because
                // otherwise we'd have gotten OutputFull already.
                dst[total_written] = 0xFFFD;
                total_written += 1;
            }
        }
    }
}

/// Converts valid UTF-8 to valid UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
#[inline]
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    let bytes = src.as_bytes();
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // ASCII fast path. The inner block keeps the mutable borrow of `dst`
        // short so that the scalar code below can index `dst` again.
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            // NOTE(review): `ascii_to_basic_latin` is defined elsewhere; it
            // is assumed to touch at most `length` items of either buffer,
            // which the assert above makes available -- confirm.
            match unsafe {
                ascii_to_basic_latin(
                    src_remaining.as_ptr(),
                    dst_remaining.as_mut_ptr(),
                    length,
                )
            } {
                None => {
                    // The rest of the input was ASCII and has been copied.
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        // Scalar path. The input is a `&str`, so the trail bytes are read
        // without validation.
        'inner: loop {
            // At this point, `byte` is not included in `read`.
            if byte < 0xE0 {
                if byte >= 0x80 {
                    // Two-byte
                    let second = bytes[read + 1];
                    let point = (((byte as u32) & 0x1Fu32) << 6) | (second as u32 & 0x3Fu32);
                    dst[written] = point as u16;
                    read += 2;
                    written += 1;
                } else {
                    // ASCII: write and go back to SIMD.
                    dst[written] = byte as u16;
                    read += 1;
                    written += 1;
                    // Intuitively, we should go back to the outer loop only
                    // if byte is 0x30 or above, so as to avoid trashing on
                    // ASCII space, comma and period in non-Latin context.
                    // However, the extra branch seems to cost more than it's
                    // worth.
                    continue 'outer;
                }
            } else if byte < 0xF0 {
                // Three-byte
                let second = bytes[read + 1];
                let third = bytes[read + 2];
                let point = (((byte as u32) & 0xFu32) << 12) | ((second as u32 & 0x3Fu32) << 6) |
                            (third as u32 & 0x3Fu32);
                dst[written] = point as u16;
                read += 3;
                written += 1;
            } else {
                // Four-byte
                let second = bytes[read + 1];
                let third = bytes[read + 2];
                let fourth = bytes[read + 3];
                let point = (((byte as u32) & 0x7u32) << 18) | ((second as u32 & 0x3Fu32) << 12) |
                            ((third as u32 & 0x3Fu32) << 6) |
                            (fourth as u32 & 0x3Fu32);
                // 0xD7C0 is 0xD800 - (0x10000 >> 10): the subtraction of
                // 0x10000 required by the surrogate formula is folded into
                // the base of the high surrogate. The low ten bits are
                // unaffected by that subtraction.
                dst[written] = (0xD7C0 + (point >> 10)) as u16;
                dst[written + 1] = (0xDC00 + (point & 0x3FF)) as u16;
                read += 4;
                written += 2;
            }
            // The comparison is always < or == and never >, but including
            // > here to let the compiler assume that < is true if this
            // comparison is false.
            if read >= src.len() {
                return written;
            }
            byte = bytes[read];
            continue 'inner;
        }
    }
}

/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
/// with the REPLACEMENT CHARACTER.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer times three _plus one_.
///
/// Returns the number of bytes written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
+/// +/// # Safety +/// +/// Note that this function may write garbage beyond the number of bytes +/// indicated by the return value, so using a `&mut str` interpreted as +/// `&mut [u8]` as the destination is not safe. If you want to convert into +/// a `&mut str`, use `convert_utf16_to_str()` instead of this function. +#[inline] +pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize { + assert!(dst.len() >= src.len() * 3 + 1); + let mut encoder = Utf8Encoder; + let (result, _, written) = encoder.encode_from_utf16_raw(src, dst, true); + debug_assert!(result == EncoderResult::InputEmpty); + written +} + +/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced +/// with the REPLACEMENT CHARACTER such that the validity of the output is +/// signaled using the Rust type system. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer times three _plus one_. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +#[inline] +pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize { + let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) }; + let written = convert_utf16_to_utf8(src, bytes); + let len = bytes.len(); + let mut trail = written; + let max = ::std::cmp::min(len, trail + STRIDE_SIZE); + while trail < max { + bytes[trail] = 0; + trail += 1; + } + while trail < len && ((bytes[trail] & 0xC0) == 0x80) { + bytes[trail] = 0; + trail += 1; + } + written +} + +/// Converts bytes whose unsigned value is interpreted as Unicode code point +/// (i.e. U+0000 to U+00FF, inclusive) to UTF-16. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// The number of `u16`s written equals the length of the source buffer. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. 
+#[inline] +pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + // TODO: On aarch64, the safe version autovectorizes to the same unpacking + // instructions and this code, but, yet, the autovectorized version is + // faster. + unsafe { + unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len()); + } +} + +/// Converts bytes whose unsigned value is interpreted as Unicode code point +/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer times two. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// # Safety +/// +/// Note that this function may write garbage beyond the number of bytes +/// indicated by the return value, so using a `&mut str` interpreted as +/// `&mut [u8]` as the destination is not safe. If you want to convert into +/// a `&mut str`, use `convert_utf16_to_str()` instead of this function. +#[inline] +pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize { + assert!( + dst.len() >= src.len() * 2, + "Destination must not be shorter than the source times two." 
+ ); + let src_len = src.len(); + let src_ptr = src.as_ptr(); + let dst_ptr = dst.as_mut_ptr(); + let mut total_read = 0usize; + let mut total_written = 0usize; + loop { + // src can't advance more than dst + let src_left = src_len - total_read; + if let Some((non_ascii, consumed)) = + unsafe { + ascii_to_ascii( + src_ptr.offset(total_read as isize), + dst_ptr.offset(total_written as isize), + src_left, + ) + } { + total_read += consumed + 1; + total_written += consumed; + + let code_point = non_ascii as u32; + dst[total_written] = ((code_point >> 6) | 0xC0u32) as u8; + total_written += 1; + dst[total_written] = ((code_point as u32 & 0x3Fu32) | 0x80u32) as u8; + total_written += 1; + continue; + } + return total_written + src_left; + } +} + +/// Converts bytes whose unsigned value is interpreted as Unicode code point +/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the +/// output is signaled using the Rust type system. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer times two. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +#[inline] +pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize { + let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) }; + let written = convert_latin1_to_utf8(src, bytes); + let len = bytes.len(); + let mut trail = written; + let max = ::std::cmp::min(len, trail + STRIDE_SIZE); + while trail < max { + bytes[trail] = 0; + trail += 1; + } + while trail < len && ((bytes[trail] & 0xC0) == 0x80) { + bytes[trail] = 0; + trail += 1; + } + written +} + +/// If the input is valid UTF-8 representing only Unicode code points from +/// U+0000 to U+00FF, inclusive, converts the input into output that +/// represents the value of each code point as the unsigned byte value of +/// each output byte. 
+/// +/// If the input does not fulfill the condition stated above, this function +/// does something that is memory-safe without any promises about any +/// properties of the output. In particular, callers shouldn't assume the +/// output to be the same across crate versions or CPU architectures and +/// should not assume that non-ASCII input can't map to ASCII output. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +#[inline] +pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + let src_len = src.len(); + let src_ptr = src.as_ptr(); + let dst_ptr = dst.as_mut_ptr(); + let mut total_read = 0usize; + let mut total_written = 0usize; + loop { + // dst can't advance more than src + let src_left = src_len - total_read; + if let Some((non_ascii, consumed)) = + unsafe { + ascii_to_ascii( + src_ptr.offset(total_read as isize), + dst_ptr.offset(total_written as isize), + src_left, + ) + } { + total_read += consumed + 1; + total_written += consumed; + + if total_read == src_len { + return total_written; + } + + let trail = src[total_read]; + total_read += 1; + + dst[total_written] = (((non_ascii as u32 & 0x1Fu32) << 6) | + (trail as u32 & 0x3Fu32)) as u8; + total_written += 1; + continue; + } + return total_written + src_left; + } +} + +/// If the input is valid UTF-16 representing only Unicode code points from +/// U+0000 to U+00FF, inclusive, converts the input into output that +/// represents the value of each code point as the unsigned byte value of +/// each output byte. +/// +/// If the input does not fulfill the condition stated above, this function +/// does something that is memory-safe without any promises about any +/// properties of the output. 
In particular, callers shouldn't assume the +/// output to be the same across crate versions or CPU architectures and +/// should not assume that non-Basic Latin input can't map to ASCII output. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// The number of bytes written equals the length of the source buffer. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +#[inline] +pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + unsafe { + pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len()); + } +} + +/// Returns the index of the first unpaired surrogate or, if the input is +/// valid UTF-16 in its entirety, the length of the input. +#[inline] +pub fn utf16_valid_up_to(buffer: &[u16]) -> usize { + utf16_valid_up_to_impl(buffer) +} + +/// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. +#[inline] +pub fn ensure_utf16_validity(buffer: &mut [u16]) { + let mut offset = 0; + loop { + offset += utf16_valid_up_to(&buffer[offset..]); + if offset == buffer.len() { + return; + } + buffer[offset] = 0xFFFD; + offset += 1; + } +} + +/// Copies ASCII from source to destination up to the first non-ASCII byte +/// (or the end of the input if it is ASCII in its entirety). +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +#[inline] +pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." 
+ ); + if let Some((_, consumed)) = + unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) } { + consumed + } else { + src.len() + } +} + +/// Copies ASCII from source to destination zero-extending it to UTF-16 up to +/// the first non-ASCII byte (or the end of the input if it is ASCII in its +/// entirety). +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of `u16`s written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +#[inline] +pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + if let Some((_, consumed)) = + unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) } { + consumed + } else { + src.len() + } +} + +/// Copies Basic Latin from source to destination narrowing it to ASCII up to +/// the first non-Basic Latin code unit (or the end of the input if it is +/// Basic Latin in its entirety). +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +#[inline] +pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + if let Some((_, consumed)) = + unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) } { + consumed + } else { + src.len() + } +} + +// Any copyright to the test code below this comment is dedicated to the +// Public Domain. 
http://creativecommons.org/publicdomain/zero/1.0/ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_is_ascii_success() { + let mut src: Vec = Vec::with_capacity(128); + src.resize(128, 0); + for i in 0..src.len() { + src[i] = i as u8; + } + for i in 0..src.len() { + assert!(is_ascii(&src[i..])); + } + } + + #[test] + fn test_is_ascii_fail() { + let mut src: Vec = Vec::with_capacity(128); + src.resize(128, 0); + for i in 0..src.len() { + src[i] = i as u8; + } + for i in 0..src.len() { + let tail = &mut src[i..]; + for j in 0..tail.len() { + tail[j] = 0xA0; + assert!(!is_ascii(tail)); + } + } + } + + #[test] + fn test_is_basic_latin_success() { + let mut src: Vec = Vec::with_capacity(128); + src.resize(128, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + assert!(is_basic_latin(&src[i..])); + } + } + + #[test] + fn test_is_basic_latin_fail() { + let mut src: Vec = Vec::with_capacity(128); + src.resize(128, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + let tail = &mut src[i..]; + for j in 0..tail.len() { + tail[j] = 0xA0; + assert!(!is_basic_latin(tail)); + } + } + } + + #[test] + fn test_is_utf16_latin1_success() { + let mut src: Vec = Vec::with_capacity(256); + src.resize(256, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + assert!(is_utf16_latin1(&src[i..])); + assert_eq!( + check_utf16_for_latin1_and_bidi(&src[i..]), + Latin1Bidi::Latin1 + ); + } + } + + #[test] + fn test_is_utf16_latin1_fail() { + let mut src: Vec = Vec::with_capacity(256); + src.resize(256, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + let tail = &mut src[i..]; + for j in 0..tail.len() { + tail[j] = 0x100 + j as u16; + assert!(!is_utf16_latin1(tail)); + assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1); + } + } + } + + #[test] + fn test_is_str_latin1_success() { + let mut src: Vec = Vec::with_capacity(256); + 
src.resize(256, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + let s = String::from_utf16(&src[i..]).unwrap(); + assert!(is_str_latin1(&s[..])); + assert_eq!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1); + } + } + + #[test] + fn test_is_str_latin1_fail() { + let mut src: Vec = Vec::with_capacity(256); + src.resize(256, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + let tail = &mut src[i..]; + for j in 0..tail.len() { + tail[j] = 0x100 + j as u16; + let s = String::from_utf16(tail).unwrap(); + assert!(!is_str_latin1(&s[..])); + assert_ne!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1); + } + } + } + + #[test] + fn test_is_utf8_latin1_success() { + let mut src: Vec = Vec::with_capacity(256); + src.resize(256, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + let s = String::from_utf16(&src[i..]).unwrap(); + assert!(is_utf8_latin1(s.as_bytes())); + assert_eq!( + check_utf8_for_latin1_and_bidi(s.as_bytes()), + Latin1Bidi::Latin1 + ); + } + } + + #[test] + fn test_is_utf8_latin1_fail() { + let mut src: Vec = Vec::with_capacity(256); + src.resize(256, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + let tail = &mut src[i..]; + for j in 0..tail.len() { + tail[j] = 0x100 + j as u16; + let s = String::from_utf16(tail).unwrap(); + assert!(!is_utf8_latin1(s.as_bytes())); + assert_ne!( + check_utf8_for_latin1_and_bidi(s.as_bytes()), + Latin1Bidi::Latin1 + ); + } + } + } + + #[test] + fn test_is_utf8_latin1_invalid() { + assert!(!is_utf8_latin1(b"\xC3")); + assert!(!is_utf8_latin1(b"a\xC3")); + assert!(!is_utf8_latin1(b"\xFF")); + assert!(!is_utf8_latin1(b"a\xFF")); + assert!(!is_utf8_latin1(b"\xC3\xFF")); + assert!(!is_utf8_latin1(b"a\xC3\xFF")); + } + + #[test] + fn test_convert_utf8_to_utf16() { + let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; + let mut dst: Vec = Vec::with_capacity(src.len() + 
1); + dst.resize(src.len() + 1, 0); + let len = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]); + dst.truncate(len); + let reference: Vec = src.encode_utf16().collect(); + assert_eq!(dst, reference); + } + + #[test] + fn test_convert_str_to_utf16() { + let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; + let mut dst: Vec = Vec::with_capacity(src.len()); + dst.resize(src.len(), 0); + let len = convert_str_to_utf16(src, &mut dst[..]); + dst.truncate(len); + let reference: Vec = src.encode_utf16().collect(); + assert_eq!(dst, reference); + } + + #[test] + fn test_convert_utf16_to_utf8() { + let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; + let src: Vec = reference.encode_utf16().collect(); + let mut dst: Vec = Vec::with_capacity(src.len() * 3 + 1); + dst.resize(src.len() * 3 + 1, 0); + let len = convert_utf16_to_utf8(&src[..], &mut dst[..]); + dst.truncate(len); + assert_eq!(dst, reference.as_bytes()); + } + + #[test] + fn test_convert_latin1_to_utf16() { + let mut src: Vec = Vec::with_capacity(256); + src.resize(256, 0); + let mut reference: Vec = Vec::with_capacity(256); + reference.resize(256, 0); + for i in 0..256 { + src[i] = i as u8; + reference[i] = i as u16; + } + let mut dst: Vec = Vec::with_capacity(src.len()); + dst.resize(src.len(), 0); + convert_latin1_to_utf16(&src[..], &mut dst[..]); + assert_eq!(dst, reference); + } + + #[test] + fn test_convert_latin1_to_utf8() { + let mut src: Vec = Vec::with_capacity(256); + src.resize(256, 0); + let mut reference: Vec = Vec::with_capacity(256); + reference.resize(256, 0); + for i in 0..256 { + src[i] = i as u8; + reference[i] = i as u16; + } + let s = String::from_utf16(&reference[..]).unwrap(); + let mut dst: Vec = Vec::with_capacity(src.len() * 2); + dst.resize(src.len() * 2, 0); + let len = convert_latin1_to_utf8(&src[..], &mut dst[..]); + dst.truncate(len); + assert_eq!(&dst[..], s.as_bytes()); + } + + #[test] + fn test_convert_utf8_to_latin1_lossy() { + let mut 
reference: Vec = Vec::with_capacity(256); + reference.resize(256, 0); + let mut src16: Vec = Vec::with_capacity(256); + src16.resize(256, 0); + for i in 0..256 { + src16[i] = i as u16; + reference[i] = i as u8; + } + let src = String::from_utf16(&src16[..]).unwrap(); + let mut dst: Vec = Vec::with_capacity(src.len()); + dst.resize(src.len(), 0); + let len = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]); + dst.truncate(len); + assert_eq!(dst, reference); + } + + #[test] + fn test_convert_utf16_to_latin1_lossy() { + let mut src: Vec = Vec::with_capacity(256); + src.resize(256, 0); + let mut reference: Vec = Vec::with_capacity(256); + reference.resize(256, 0); + for i in 0..256 { + src[i] = i as u16; + reference[i] = i as u8; + } + let mut dst: Vec = Vec::with_capacity(src.len()); + dst.resize(src.len(), 0); + convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]); + assert_eq!(dst, reference); + } + + #[test] + fn test_utf16_valid_up_to() { + let valid = vec![0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0x2603u16, 0xD83Du16, 0xDCA9u16, 0x00B6u16]; + assert_eq!(utf16_valid_up_to(&valid[..]), 16);; + let lone_high = vec![0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0u16, 0u16, 0x2603u16, 0xD83Du16, 0x00B6u16]; + assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);; + let lone_low = vec![0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0u16, 0u16, 0x2603u16, 0xDCA9u16, 0x00B6u16]; + assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);; + let lone_high_at_end = vec![0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0u16, 0u16, 0u16, 0x2603u16, 0x00B6u16, 0xD83Du16]; + assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);; + } + + #[test] + fn test_ensure_utf16_validity() { + let mut src = vec![0u16, 0xD83Du16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, + 0u16, 0u16, 0u16, 0u16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 
0u16, 0u16, 0u16, 0u16]; + let reference = vec![0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, + 0u16, 0u16, 0u16, 0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0u16]; + ensure_utf16_validity(&mut src[..]); + assert_eq!(src, reference); + } + + #[test] + fn test_is_char_bidi() { + assert!(!is_char_bidi('a')); + assert!(!is_char_bidi('\u{03B1}')); + assert!(!is_char_bidi('\u{3041}')); + assert!(!is_char_bidi('\u{1F4A9}')); + assert!(!is_char_bidi('\u{FE00}')); + assert!(!is_char_bidi('\u{202C}')); + assert!(is_char_bidi('\u{0590}')); + assert!(is_char_bidi('\u{08FF}')); + assert!(is_char_bidi('\u{061C}')); + assert!(is_char_bidi('\u{FB50}')); + assert!(is_char_bidi('\u{FDFF}')); + assert!(is_char_bidi('\u{FE70}')); + assert!(is_char_bidi('\u{FEFF}')); + assert!(is_char_bidi('\u{200F}')); + assert!(is_char_bidi('\u{202B}')); + assert!(is_char_bidi('\u{202E}')); + assert!(is_char_bidi('\u{2067}')); + assert!(is_char_bidi('\u{10800}')); + assert!(is_char_bidi('\u{10FFF}')); + assert!(is_char_bidi('\u{1E800}')); + assert!(is_char_bidi('\u{1EFFF}')); + } + + #[test] + fn test_is_utf16_code_unit_bidi() { + assert!(!is_utf16_code_unit_bidi(0x0062)); + assert!(!is_utf16_code_unit_bidi(0x03B1)); + assert!(!is_utf16_code_unit_bidi(0x3041)); + assert!(!is_utf16_code_unit_bidi(0xD801)); + assert!(!is_utf16_code_unit_bidi(0xFE00)); + assert!(!is_utf16_code_unit_bidi(0x202C)); + assert!(is_utf16_code_unit_bidi(0x0590)); + assert!(is_utf16_code_unit_bidi(0x08FF)); + assert!(is_utf16_code_unit_bidi(0x061C)); + assert!(is_utf16_code_unit_bidi(0xFB50)); + assert!(is_utf16_code_unit_bidi(0xFDFF)); + assert!(is_utf16_code_unit_bidi(0xFE70)); + assert!(is_utf16_code_unit_bidi(0xFEFF)); + assert!(is_utf16_code_unit_bidi(0x200F)); + assert!(is_utf16_code_unit_bidi(0x202B)); + assert!(is_utf16_code_unit_bidi(0x202E)); + assert!(is_utf16_code_unit_bidi(0x2067)); + 
assert!(is_utf16_code_unit_bidi(0xD802)); + assert!(is_utf16_code_unit_bidi(0xD803)); + assert!(is_utf16_code_unit_bidi(0xD83A)); + assert!(is_utf16_code_unit_bidi(0xD83B)); + } + + #[test] + fn test_is_str_bidi() { + assert!(!is_str_bidi("abcdefghijklmnopaabcdefghijklmnop")); + assert!(!is_str_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop")); + assert!(!is_str_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop")); + assert!(!is_str_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop")); + assert!(!is_str_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop")); + assert!(!is_str_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop")); + } + + #[test] + fn test_is_utf8_bidi() { + assert!(!is_utf8_bidi("abcdefghijklmnopaabcdefghijklmnop".as_bytes())); + assert!(!is_utf8_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes())); + assert!(!is_utf8_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes())); + 
assert!(!is_utf8_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes())); + assert!(!is_utf8_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes())); + assert!(!is_utf8_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes())); + assert!(is_utf8_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes())); + } + + #[test] + fn test_is_utf16_bidi() { + assert!( + !is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + !is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + !is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + !is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 
0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + !is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + !is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + 
assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ) + ); + + assert!( + is_utf16_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, + 0x64, 0x65, 0x66, 0x67, 0x68, 0x69] + ) + ); + } + + #[test] + fn test_check_str_for_latin1_and_bidi() { + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + 
check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + } + + #[test] + fn test_check_utf8_for_latin1_and_bidi() { + assert_ne!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_ne!( + 
check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()), + 
Latin1Bidi::Bidi + ); + } + + #[test] + fn test_check_utf16_for_latin1_and_bidi() { + assert_ne!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + 
check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65, + 
0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + + assert_eq!( + check_utf16_for_latin1_and_bidi( + &[0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, + 0x64, 0x65, 0x66, 0x67, 0x68, 0x69] + ), + Latin1Bidi::Bidi + ); + } + + #[inline(always)] + pub fn reference_is_char_bidi(c: char) -> bool { + match c { + '\u{0590}'...'\u{08FF}' | + '\u{FB50}'...'\u{FDFF}' | + '\u{FE70}'...'\u{FEFF}' | + '\u{10800}'...'\u{10FFF}' | + '\u{1E800}'...'\u{1EFFF}' | + '\u{200F}' | + '\u{202B}' | + '\u{202E}' | + '\u{2067}' => true, + _ => false, + } + } + + #[inline(always)] + pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool { + match u { + 0x0590...0x08FF | 0xFB50...0xFDFF | 0xFE70...0xFEFF | 0xD802 | 0xD803 | 0xD83A | + 0xD83B | 0x200F | 0x202B | 0x202E | 0x2067 => true, + _ => false, + } + } + + #[test] + fn test_is_char_bidi_thoroughly() { + for i in 0..0xD800u32 { + let c: char = unsafe { ::std::mem::transmute(i) }; + assert_eq!(is_char_bidi(c), reference_is_char_bidi(c)); + } + for i in 0xE000..0x110000u32 { + let c: char = unsafe { ::std::mem::transmute(i) }; + assert_eq!(is_char_bidi(c), reference_is_char_bidi(c)); + } + } + + #[test] + fn test_is_utf16_code_unit_bidi_thoroughly() { + for i in 0..0x10000u32 { + let u = i as u16; + assert_eq!( + is_utf16_code_unit_bidi(u), + reference_is_utf16_code_unit_bidi(u) + ); + } + } + + #[test] + fn test_is_str_bidi_thoroughly() { + let mut buf = [0; 4]; + for i in 0..0xD800u32 { + let c: char = unsafe { ::std::mem::transmute(i) }; + assert_eq!( + is_str_bidi(c.encode_utf8(&mut buf[..])), + reference_is_char_bidi(c) + ); + } + for i in 0xE000..0x110000u32 { + let c: char = unsafe { ::std::mem::transmute(i) }; + assert_eq!( + is_str_bidi(c.encode_utf8(&mut buf[..])), + reference_is_char_bidi(c) + ); + } + } + + #[test] + fn test_is_utf8_bidi_thoroughly() { + let mut buf = [0; 8]; + for i in 0..0xD800u32 { + let c: char = unsafe { ::std::mem::transmute(i) }; + let expect = 
reference_is_char_bidi(c); + { + let len = { + let bytes = c.encode_utf8(&mut buf[..]).as_bytes(); + assert_eq!(is_utf8_bidi(bytes), expect); + bytes.len() + }; + { + let tail = &mut buf[len..]; + for b in tail.iter_mut() { + *b = 0; + } + } + } + assert_eq!(is_utf8_bidi(&buf[..]), expect); + } + for i in 0xE000..0x110000u32 { + let c: char = unsafe { ::std::mem::transmute(i) }; + let expect = reference_is_char_bidi(c); + { + let len = { + let bytes = c.encode_utf8(&mut buf[..]).as_bytes(); + assert_eq!(is_utf8_bidi(bytes), expect); + bytes.len() + }; + { + let tail = &mut buf[len..]; + for b in tail.iter_mut() { + *b = 0; + } + } + } + assert_eq!(is_utf8_bidi(&buf[..]), expect); + } + } + + #[test] + fn test_is_utf8_bidi_edge_cases() { + assert!(!is_utf8_bidi(b"\xD5\xBF\x61")); + assert!(!is_utf8_bidi(b"\xD6\x80\x61")); + assert!(!is_utf8_bidi(b"abc")); + assert!(is_utf8_bidi(b"\xD5\xBF\xC2")); + assert!(is_utf8_bidi(b"\xD6\x80\xC2")); + assert!(is_utf8_bidi(b"ab\xC2")); + } +} diff --git a/third_party/rust/encoding_rs/src/simd_funcs.rs b/third_party/rust/encoding_rs/src/simd_funcs.rs index 1614cdb367a3..867a3f1cf161 100644 --- a/third_party/rust/encoding_rs/src/simd_funcs.rs +++ b/third_party/rust/encoding_rs/src/simd_funcs.rs @@ -21,6 +21,7 @@ pub unsafe fn load16_unaligned(ptr: *const u8) -> u8x16 { simd } +#[allow(dead_code)] #[inline(always)] pub unsafe fn load16_aligned(ptr: *const u8) -> u8x16 { *(ptr as *const u8x16) @@ -31,6 +32,7 @@ pub unsafe fn store16_unaligned(ptr: *mut u8, s: u8x16) { ::std::ptr::copy_nonoverlapping(&s as *const u8x16 as *const u8, ptr, 16); } +#[allow(dead_code)] #[inline(always)] pub unsafe fn store16_aligned(ptr: *mut u8, s: u8x16) { *(ptr as *mut u8x16) = s; @@ -43,6 +45,7 @@ pub unsafe fn load8_unaligned(ptr: *const u16) -> u16x8 { simd } +#[allow(dead_code)] #[inline(always)] pub unsafe fn load8_aligned(ptr: *const u16) -> u16x8 { *(ptr as *const u16x8) @@ -53,6 +56,7 @@ pub unsafe fn store8_unaligned(ptr: *mut u16, s: u16x8) 
{ ::std::ptr::copy_nonoverlapping(&s as *const u16x8 as *const u8, ptr as *mut u8, 16); } +#[allow(dead_code)] #[inline(always)] pub unsafe fn store8_aligned(ptr: *mut u16, s: u16x8) { *(ptr as *mut u16x8) = s; @@ -89,7 +93,7 @@ cfg_if! { cfg_if! { if #[cfg(target_feature = "sse2")] { #[inline(always)] - pub fn is_ascii(s: u8x16) -> bool { + pub fn simd_is_ascii(s: u8x16) -> bool { unsafe { let signed: i8x16 = ::std::mem::transmute_copy(&s); x86_mm_movemask_epi8(signed) == 0 @@ -101,16 +105,42 @@ cfg_if! { } #[inline(always)] - pub fn is_ascii(s: u8x16) -> bool { + pub fn simd_is_ascii(s: u8x16) -> bool { unsafe { aarch64_vmaxvq_u8(s) < 0x80 } } } else { #[inline(always)] - pub fn is_ascii(s: u8x16) -> bool { - let highest_ascii = u8x16::splat(0x7F); - !s.gt(highest_ascii).any() + pub fn simd_is_ascii(s: u8x16) -> bool { + let above_ascii = u8x16::splat(0x80); + s.lt(above_ascii).all() + } + } +} + +cfg_if! { + if #[cfg(target_feature = "sse2")] { + #[inline(always)] + pub fn simd_is_str_latin1(s: u8x16) -> bool { + if simd_is_ascii(s) { + return true; + } + let above_str_latin1 = u8x16::splat(0xC4); + s.lt(above_str_latin1).all() + } + } else if #[cfg(target_arch = "aarch64")]{ + #[inline(always)] + pub fn simd_is_str_latin1(s: u8x16) -> bool { + unsafe { + aarch64_vmaxvq_u8(s) < 0xC4 + } + } + } else { + #[inline(always)] + pub fn simd_is_str_latin1(s: u8x16) -> bool { + let above_str_latin1 = u8x16::splat(0xC4); + s.lt(above_str_latin1).all() } } } @@ -122,20 +152,107 @@ cfg_if! 
{ } #[inline(always)] - pub fn is_basic_latin(s: u16x8) -> bool { + pub fn simd_is_basic_latin(s: u16x8) -> bool { unsafe { aarch64_vmaxvq_u16(s) < 0x80 } } + + #[inline(always)] + pub fn simd_is_latin1(s: u16x8) -> bool { + unsafe { + aarch64_vmaxvq_u16(s) < 0x100 + } + } } else { #[inline(always)] - pub fn is_basic_latin(s: u16x8) -> bool { - let highest_ascii = u16x8::splat(0x7F); - !s.gt(highest_ascii).any() + pub fn simd_is_basic_latin(s: u16x8) -> bool { + let above_ascii = u16x8::splat(0x80); + s.lt(above_ascii).all() + } + + #[inline(always)] + pub fn simd_is_latin1(s: u16x8) -> bool { + // For some reason, on SSE2 this formulation + // seems faster in this case while the above + // function is better the other way round... + let highest_latin1 = u16x8::splat(0xFF); + !s.gt(highest_latin1).any() } } } +#[inline(always)] +pub fn contains_surrogates(s: u16x8) -> bool { + let mask = u16x8::splat(0xF800); + let surrogate_bits = u16x8::splat(0xD800); + (s & mask).eq(surrogate_bits).any() +} + +cfg_if! { + if #[cfg(target_arch = "aarch64")]{ + macro_rules! aarch64_return_false_if_below_hebrew { + ($s:ident) => ({ + unsafe { + if aarch64_vmaxvq_u16($s) < 0x0590 { + return false; + } + } + }) + } + + macro_rules! non_aarch64_return_false_if_all { + ($s:ident) => () + } + } else { + macro_rules! aarch64_return_false_if_below_hebrew { + ($s:ident) => () + } + + macro_rules! non_aarch64_return_false_if_all { + ($s:ident) => ({ + if $s.all() { + return false; + } + }) + } + } +} + +macro_rules! in_range16x8 { + ($s:ident, $start:expr, $end:expr) => ({ + // SIMD sub is wrapping + ($s - u16x8::splat($start)).lt(u16x8::splat($end - $start)) + }) +} + +#[inline(always)] +pub fn is_u16x8_bidi(s: u16x8) -> bool { + // We try to first quickly refute the RTLness of the vector. If that + // fails, we do the real RTL check, so in that case we end up wasting + // the work for the up-front quick checks. 
Even the quick-check is + // two-fold in order to return `false` ASAP if everything is below + // Hebrew. + + aarch64_return_false_if_below_hebrew!(s); + + let below_hebrew = s.lt(u16x8::splat(0x0590)); + + non_aarch64_return_false_if_all!(below_hebrew); + + if (below_hebrew | in_range16x8!(s, 0x0900, 0x200F) | in_range16x8!(s, 0x2068, 0xD802)).all() { + return false; + } + + // Quick refutation failed. Let's do the full check. + + (in_range16x8!(s, 0x0590, 0x0900) | in_range16x8!(s, 0xFB50, 0xFE00) | + in_range16x8!(s, 0xFE70, 0xFF00) | in_range16x8!(s, 0xD802, 0xD804) | + in_range16x8!(s, 0xD83A, 0xD83C) | s.eq(u16x8::splat(0x200F)) | + s.eq(u16x8::splat(0x202B)) | s.eq(u16x8::splat(0x202E)) | s.eq(u16x8::splat(0x2067))) + .any() +} + #[inline(always)] pub fn simd_unpack(s: u8x16) -> (u16x8, u16x8) { unsafe { @@ -206,7 +323,7 @@ mod tests { } #[test] - fn test_is_basic_latin_success() { + fn test_simd_is_basic_latin_success() { let ascii: [u8; 16] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76]; let basic_latin: [u16; 16] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, @@ -216,7 +333,7 @@ mod tests { let mut vec = Vec::with_capacity(16); vec.resize(16, 0u8); let ptr = vec.as_mut_ptr(); - assert!(is_basic_latin(first | second)); + assert!(simd_is_basic_latin(first | second)); unsafe { store16_unaligned(ptr, simd_pack(first, second)); } @@ -224,46 +341,46 @@ mod tests { } #[test] - fn test_is_basic_latin_c0() { + fn test_simd_is_basic_latin_c0() { let input: [u16; 16] = [0x61, 0x62, 0x63, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76]; let first = unsafe { load8_unaligned(input.as_ptr()) }; let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) }; - assert!(!is_basic_latin(first | second)); + assert!(!simd_is_basic_latin(first | second)); } #[test] - fn test_is_basic_latin_0fff() { + fn test_simd_is_basic_latin_0fff() { let input: [u16; 16] = [0x61, 0x62, 0x63, 
0x0FFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76]; let first = unsafe { load8_unaligned(input.as_ptr()) }; let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) }; - assert!(!is_basic_latin(first | second)); + assert!(!simd_is_basic_latin(first | second)); } #[test] - fn test_is_basic_latin_ffff() { + fn test_simd_is_basic_latin_ffff() { let input: [u16; 16] = [0x61, 0x62, 0x63, 0xFFFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76]; let first = unsafe { load8_unaligned(input.as_ptr()) }; let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) }; - assert!(!is_basic_latin(first | second)); + assert!(!simd_is_basic_latin(first | second)); } #[test] - fn test_is_ascii_success() { + fn test_simd_is_ascii_success() { let ascii: [u8; 16] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76]; let simd = unsafe { load16_unaligned(ascii.as_ptr()) }; - assert!(is_ascii(simd)); + assert!(simd_is_ascii(simd)); } #[test] - fn test_is_ascii_failure() { + fn test_simd_is_ascii_failure() { let input: [u8; 16] = [0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76]; let simd = unsafe { load16_unaligned(input.as_ptr()) }; - assert!(!is_ascii(simd)); + assert!(!simd_is_ascii(simd)); } #[cfg(target_feature = "sse2")] diff --git a/third_party/rust/encoding_rs/src/testing.rs b/third_party/rust/encoding_rs/src/testing.rs index eeeea00efa2e..724eb60579de 100644 --- a/third_party/rust/encoding_rs/src/testing.rs +++ b/third_party/rust/encoding_rs/src/testing.rs @@ -22,13 +22,20 @@ pub fn decode(encoding: &'static Encoding, bytes: &[u8], expect: &str) { } vec.extend_from_slice(bytes); string.push_str(expect); - decode_without_padding(encoding, &vec[..], &string[..]); + decode_without_padding_impl(encoding, &vec[..], &string[..], i); } } pub fn decode_without_padding(encoding: &'static Encoding, bytes: &[u8], expect: &str) { - 
decode_to_utf8(encoding, bytes, expect); - decode_to_utf16(encoding, bytes, &utf16_from_utf8(expect)[..]); + decode_without_padding_impl(encoding, bytes, expect, 0); +} + +fn decode_without_padding_impl(encoding: &'static Encoding, + bytes: &[u8], + expect: &str, + padding: usize) { + decode_to_utf8_impl(encoding, bytes, expect, padding); + decode_to_utf16_impl(encoding, bytes, &utf16_from_utf8(expect)[..], padding); decode_to_string(encoding, bytes, expect); } @@ -56,40 +63,116 @@ pub fn encode_without_padding(encoding: &'static Encoding, string: &str, expect: } pub fn decode_to_utf16(encoding: &'static Encoding, bytes: &[u8], expect: &[u16]) { + decode_to_utf16_impl(encoding, bytes, expect, 0); +} + +pub fn decode_to_utf16_impl(encoding: &'static Encoding, + bytes: &[u8], + expect: &[u16], + padding: usize) { + for i in padding..bytes.len() { + let (head, tail) = bytes.split_at(i); + decode_to_utf16_with_boundary(encoding, head, tail, expect); + } +} + +pub fn decode_to_utf16_with_boundary(encoding: &'static Encoding, + head: &[u8], + tail: &[u8], + expect: &[u16]) { let mut decoder = encoding.new_decoder(); - let mut dest: Vec = - Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap()); + let mut dest: Vec = Vec::with_capacity( + decoder + .max_utf16_buffer_length(head.len() + tail.len()) + .unwrap() + ); let capacity = dest.capacity(); dest.resize(capacity, 0u16); - let (complete, read, written, _) = decoder.decode_to_utf16(bytes, &mut dest, true); - match complete { - CoderResult::InputEmpty => {} - CoderResult::OutputFull => { - unreachable!(); + let mut total_read = 0; + let mut total_written = 0; + { + let (complete, read, written, _) = decoder.decode_to_utf16(head, &mut dest, false); + match complete { + CoderResult::InputEmpty => {} + CoderResult::OutputFull => { + unreachable!(); + } } + total_read += read; + total_written += written; } - assert_eq!(read, bytes.len()); - assert_eq!(written, expect.len()); - dest.truncate(written); + { + 
let (complete, read, written, _) = + decoder.decode_to_utf16(tail, &mut dest[total_written..], true); + match complete { + CoderResult::InputEmpty => {} + CoderResult::OutputFull => { + unreachable!(); + } + } + total_read += read; + total_written += written; + } + assert_eq!(total_read, head.len() + tail.len()); + assert_eq!(total_written, expect.len()); + dest.truncate(total_written); assert_eq!(&dest[..], expect); } pub fn decode_to_utf8(encoding: &'static Encoding, bytes: &[u8], expect: &str) { + decode_to_utf8_impl(encoding, bytes, expect, 0); +} + +pub fn decode_to_utf8_impl(encoding: &'static Encoding, + bytes: &[u8], + expect: &str, + padding: usize) { + for i in padding..bytes.len() { + let (head, tail) = bytes.split_at(i); + decode_to_utf8_with_boundary(encoding, head, tail, expect); + } +} + +pub fn decode_to_utf8_with_boundary(encoding: &'static Encoding, + head: &[u8], + tail: &[u8], + expect: &str) { let mut decoder = encoding.new_decoder(); - let mut dest: Vec = - Vec::with_capacity(decoder.max_utf8_buffer_length(bytes.len()).unwrap()); + let mut dest: Vec = Vec::with_capacity( + decoder + .max_utf8_buffer_length(head.len() + tail.len()) + .unwrap() + ); let capacity = dest.capacity(); dest.resize(capacity, 0u8); - let (complete, read, written, _) = decoder.decode_to_utf8(bytes, &mut dest, true); - match complete { - CoderResult::InputEmpty => {} - CoderResult::OutputFull => { - unreachable!(); + let mut total_read = 0; + let mut total_written = 0; + { + let (complete, read, written, _) = decoder.decode_to_utf8(head, &mut dest, false); + match complete { + CoderResult::InputEmpty => {} + CoderResult::OutputFull => { + unreachable!(); + } } + total_read += read; + total_written += written; } - assert_eq!(read, bytes.len()); - assert_eq!(written, expect.len()); - dest.truncate(written); + { + let (complete, read, written, _) = + decoder.decode_to_utf8(tail, &mut dest[total_written..], true); + match complete { + CoderResult::InputEmpty => {} + 
CoderResult::OutputFull => { + unreachable!(); + } + } + total_read += read; + total_written += written; + } + assert_eq!(total_read, head.len() + tail.len()); + assert_eq!(total_written, expect.len()); + dest.truncate(total_written); assert_eq!(&dest[..], expect.as_bytes()); } diff --git a/third_party/rust/encoding_rs/src/utf_8.rs b/third_party/rust/encoding_rs/src/utf_8.rs index a31587a314b8..9920c523f757 100644 --- a/third_party/rust/encoding_rs/src/utf_8.rs +++ b/third_party/rust/encoding_rs/src/utf_8.rs @@ -34,21 +34,21 @@ cfg_if! { } } -const UTF8_NORMAL_TRAIL: u8 = 1 << 3; +pub const UTF8_NORMAL_TRAIL: u8 = 1 << 3; -const UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL: u8 = 1 << 4; +pub const UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL: u8 = 1 << 4; -const UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL: u8 = 1 << 5; +pub const UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL: u8 = 1 << 5; -const UTF8_FOUR_BYTE_SPECIAL_LOWER_BOUND_TRAIL: u8 = 1 << 6; +pub const UTF8_FOUR_BYTE_SPECIAL_LOWER_BOUND_TRAIL: u8 = 1 << 6; -const UTF8_FOUR_BYTE_SPECIAL_UPPER_BOUND_TRAIL: u8 = 1 << 7; +pub const UTF8_FOUR_BYTE_SPECIAL_UPPER_BOUND_TRAIL: u8 = 1 << 7; // BEGIN GENERATED CODE. PLEASE DO NOT EDIT. // Instead, please regenerate using generate-encoding-data.py /// Bit is 1 if the trail is invalid. 
-static UTF8_TRAIL_INVALID: [u8; 256] = +pub static UTF8_TRAIL_INVALID: [u8; 256] = [248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, @@ -433,16 +433,18 @@ pub struct Utf8Decoder { } impl Utf8Decoder { + pub fn new_inner() -> Utf8Decoder { + Utf8Decoder { + code_point: 0, + bytes_seen: 0, + bytes_needed: 0, + lower_boundary: 0x80u8, + upper_boundary: 0xBFu8, + } + } + pub fn new() -> VariantDecoder { - VariantDecoder::Utf8( - Utf8Decoder { - code_point: 0, - bytes_seen: 0, - bytes_needed: 0, - lower_boundary: 0x80u8, - upper_boundary: 0xBFu8, - } - ) + VariantDecoder::Utf8(Utf8Decoder::new_inner()) } fn extra_from_state(&self) -> usize { diff --git a/third_party/rust/simd/.cargo-checksum.json b/third_party/rust/simd/.cargo-checksum.json index 65f31b0560e6..470289a0355c 100644 --- a/third_party/rust/simd/.cargo-checksum.json +++ b/third_party/rust/simd/.cargo-checksum.json @@ -1 +1 @@ 
-{"files":{".travis.yml":"e2c720c3633b7671efce49147c62b12bcbf630d7c5d6fc65cd97620bfa4ddcea","Cargo.toml":"608aad04f17a524ee21048fa2ce9f656ae344e0473dd0e331dc954f0f9677c63","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6d3a9431e65e69c73a8923e6517b889d17549b23db406b9ec027710d16af701f","README.md":"249294a9a5f63c64c0f7fe4a607060f43f3507dce2378067aa59d25fb3ae681d","benches/mandelbrot.rs":"051b5199e66bca6cf7774e9024915fd4e1349ab39726a10a14e06b60d65d87a4","benches/matrix.rs":"048a21dacdb62365e0105d00d2c8cd6bd2396ac81134f2bff7eb4f7d095fb735","examples/axpy.rs":"4307626045d64ec08361c97c9c72c5dc8d361bdc88f64453b97ac0212041a1b2","examples/convert.rs":"8e658fde050f8a0d8b84ad7570446b10fcf544afbd551b940ca340474f324840","examples/dot-product.rs":"6fe2e007c147af5353804173a593c5b9d57dbccec156e1da37e9e32537363f91","examples/fannkuch-redux-nosimd.rs":"7b2fbde35e8666929d14d67328471cb0483d038a5325232f8db148b30865312b","examples/fannkuch-redux.rs":"ea21fdbd2274488a62cc984acad6e0b65d52f24fb4ff63b7057a3a667e9c8aae","examples/mandelbrot.rs":"8b8fdca1edac50e5a33e0e0592bd41eb75114f31839ccd40d485c61a9a664380","examples/matrix-inverse.rs":"a378d20ef20c2119bb10a86de27c92fec2c2f77f374e6bfd36707c9825a5fe92","examples/nbody-nosimd.rs":"2c8e0a7feacd202fdd65eeceb6420d6e9f43340b81f20a8e532704a587a2796b","examples/nbody.rs":"a864311affab262024479d6348ff51af43d809e9ad332ec30ea4aacceaa2eae1","examples/ops.rs":"1316f915d0afcfa98fdc4077e965ccccf6b4b21c433cbe487ff0cdc60df3cd39","examples/spectral-norm-nosimd.rs":"ffc8512ecde779078ea467f38f423a0ea623c63da7078193f9dd370200773f79","examples/spectral-norm.rs":"edb09c9d477f83939098cfb77a27cc298bc7a0c8a8e29cece0cccae0d70d890e","src/aarch64/mod.rs":"83f52775364c98de0cecb7e1509530c18972e932469f5f1522aa24a735d0fa37","src/aarch64/neon.rs":"1fe769979e07d8e2bc3c78ce116e05d735860744efe097a894cc9421153257fb","src/arm/mod.rs":"dcdd90bc0b39abaf86a0c8946d442b16313563fbae1ff03248628275c74d8617","src/arm/neon.rs":"51cc5098
56200e80f8e4cc2c982586e6d1cef593ec4537e153dce0cfe31d3428","src/common.rs":"62f4e7e0fefb52ad190d0f2191bc435ac4deab3f2bc70dc427f2a7f9ccb7856e","src/lib.rs":"25f0b39c038fa85af858318135dfd87865be26c33bb4bd1438aec96a1e68d8b5","src/sixty_four.rs":"510a9e00189a61e4f0a5beb7052d5dee37fc8261f94a2af45ef10327e0f3b7df","src/v256.rs":"2e328e49034876d535e0627c7a62191da2b4fb156a657614bf531a5fc75b1385","src/x86/avx.rs":"c66140abefca634b48eae307c3ec8cf5a40f2279b10e246a7e2ac602a2a2bb28","src/x86/avx2.rs":"efe3006b13a13261a3dec3d37dc1d8cb53950f3803c420069231803374949937","src/x86/mod.rs":"0acc5a5e2672e2a0fddc11065663be8b8fa2da87320ea291fa86ff8c2f33edf5","src/x86/sse2.rs":"5ceda75a401958a135fc9d851b22075314cdeed69fd483b6a7be4f11373f40da","src/x86/sse3.rs":"9bd01a4f08069ca4f445952e744d651efe887e3835b18872e757375f0d053bd2","src/x86/sse4_1.rs":"9ceb80dd70a7e7dfeef508cb935e1a2637175bc87a3b090f5dea691ff6aa0516","src/x86/sse4_2.rs":"c59321aed8decdce4d0d8570cff46aed02e1a8265647ef7702e9b180fc581254","src/x86/ssse3.rs":"2290f0269bae316b8e0491495645ee38a9bd73525c8572759c1328341c3bdb4c"},"package":"7a94d14a2ae1f1f110937de5fb69e494372560181c7e1739a097fcc2cee37ba0"} \ No newline at end of file 
+{"files":{".travis.yml":"e2c720c3633b7671efce49147c62b12bcbf630d7c5d6fc65cd97620bfa4ddcea","Cargo.toml":"27c6a208f0c6253c4580508311d49bb421944abd272a7f9a5a38b51ef657aec2","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6d3a9431e65e69c73a8923e6517b889d17549b23db406b9ec027710d16af701f","README.md":"249294a9a5f63c64c0f7fe4a607060f43f3507dce2378067aa59d25fb3ae681d","benches/mandelbrot.rs":"051b5199e66bca6cf7774e9024915fd4e1349ab39726a10a14e06b60d65d87a4","benches/matrix.rs":"048a21dacdb62365e0105d00d2c8cd6bd2396ac81134f2bff7eb4f7d095fb735","examples/axpy.rs":"4307626045d64ec08361c97c9c72c5dc8d361bdc88f64453b97ac0212041a1b2","examples/convert.rs":"8e658fde050f8a0d8b84ad7570446b10fcf544afbd551b940ca340474f324840","examples/dot-product.rs":"6fe2e007c147af5353804173a593c5b9d57dbccec156e1da37e9e32537363f91","examples/fannkuch-redux-nosimd.rs":"7b2fbde35e8666929d14d67328471cb0483d038a5325232f8db148b30865312b","examples/fannkuch-redux.rs":"ea21fdbd2274488a62cc984acad6e0b65d52f24fb4ff63b7057a3a667e9c8aae","examples/mandelbrot.rs":"71be242543c1e487145d7f16341c05d05d86109de4d9e94c5d6bc9a9c6ed9766","examples/matrix-inverse.rs":"93dbc55c66a72e5f7bc730072f35682523fa20dd362755d8443ad6982143cb5d","examples/nbody-nosimd.rs":"9cf46ea02e266c20f811318f1c5856d5afb9575b2d48d552fbd978f5c1856bdb","examples/nbody.rs":"a864311affab262024479d6348ff51af43d809e9ad332ec30ea4aacceaa2eae1","examples/ops.rs":"b08ea83583df71d0052895d677320a9888da5b6729c9b70636d31ede5128bb7f","examples/spectral-norm-nosimd.rs":"ffc8512ecde779078ea467f38f423a0ea623c63da7078193f9dd370200773f79","examples/spectral-norm.rs":"edb09c9d477f83939098cfb77a27cc298bc7a0c8a8e29cece0cccae0d70d890e","src/aarch64/mod.rs":"83f52775364c98de0cecb7e1509530c18972e932469f5f1522aa24a735d0fa37","src/aarch64/neon.rs":"3c05ea43b7261b9af9c0d904b37de01c2ba99caedcb464700f16617b672965a1","src/arm/mod.rs":"dcdd90bc0b39abaf86a0c8946d442b16313563fbae1ff03248628275c74d8617","src/arm/neon.rs":"00aed2c9
4455b7ff5755b7598fb166a94c7242ad9adf4e5379560ab04af560e7","src/common.rs":"c5a7b937c5cd8c3bccf0fb20d5d77770c0d9b0dd9fa06a661c6f2ddf118e65c0","src/lib.rs":"08c345b6a2ad641daa3c1a40b1dcc6e4f9047939414bd81b05051fc74a563fec","src/sixty_four.rs":"d168776d02acf943bda8044b24e644b7a9584197a223eba1a7c3024b205dc87d","src/v256.rs":"34bfde3676e23f6925db5d0408ae838e3aab7706128fd7c33e855b8579c69318","src/x86/avx.rs":"efcf2120a904a89b0adf2d3d3bdd0ca17df2ec058410af23fb7e81915873f808","src/x86/avx2.rs":"3bcb3f391ad5f16f0a6da0bc1301329beb478ad6265bd3b2c9c124fc2e6198e5","src/x86/mod.rs":"0acc5a5e2672e2a0fddc11065663be8b8fa2da87320ea291fa86ff8c2f33edf5","src/x86/sse2.rs":"8807fb04bbfb404e17fcacf1e21d22616f8b377540a227b1fd03c121879122dd","src/x86/sse3.rs":"9bd01a4f08069ca4f445952e744d651efe887e3835b18872e757375f0d053bd2","src/x86/sse4_1.rs":"9ceb80dd70a7e7dfeef508cb935e1a2637175bc87a3b090f5dea691ff6aa0516","src/x86/sse4_2.rs":"c59321aed8decdce4d0d8570cff46aed02e1a8265647ef7702e9b180fc581254","src/x86/ssse3.rs":"2290f0269bae316b8e0491495645ee38a9bd73525c8572759c1328341c3bdb4c"},"package":"3dd0805c7363ab51a829a1511ad24b6ed0349feaa756c4bc2f977f9f496e6673"} \ No newline at end of file diff --git a/third_party/rust/simd/Cargo.toml b/third_party/rust/simd/Cargo.toml index 769d509eb3ad..31e1f908cc49 100644 --- a/third_party/rust/simd/Cargo.toml +++ b/third_party/rust/simd/Cargo.toml @@ -1,25 +1,36 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g. crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. 
If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + [package] name = "simd" -version = "0.2.0" +version = "0.2.1" authors = ["Huon Wilson "] - -repository = "https://github.com/rust-lang-nursery/simd" +description = "`simd` offers limited cross-platform access to SIMD instructions on\nCPUs, as well as raw interfaces to platform-specific instructions.\n" documentation = "https://rust-lang-nursery.github.io/simd/doc/simd" -license = "MIT/Apache-2.0" -keywords = ["simd", "data-parallel"] readme = "README.md" +keywords = ["simd", "data-parallel"] +license = "MIT/Apache-2.0" +repository = "https://github.com/rust-lang-nursery/simd" +[package.metadata.docs.rs] +features = ["doc"] +[dependencies.serde] +version = "1.0" +optional = true -description = """ -`simd` offers limited cross-platform access to SIMD instructions on -CPUs, as well as raw interfaces to platform-specific instructions. -""" - -[dependencies] -serde = { version = "0.8", optional = true } -serde_derive = { version = "0.8", optional = true } - -[dev-dependencies] -cfg-if = "0.1" +[dependencies.serde_derive] +version = "1.0" +optional = true +[dev-dependencies.cfg-if] +version = "0.1" [features] doc = [] diff --git a/third_party/rust/simd/examples/mandelbrot.rs b/third_party/rust/simd/examples/mandelbrot.rs index 69a5214bbcb7..c6f1320a0784 100755 --- a/third_party/rust/simd/examples/mandelbrot.rs +++ b/third_party/rust/simd/examples/mandelbrot.rs @@ -1,4 +1,4 @@ -#![feature(step_by, test)] +#![feature(iterator_step_by, test)] extern crate test; extern crate simd; diff --git a/third_party/rust/simd/examples/matrix-inverse.rs b/third_party/rust/simd/examples/matrix-inverse.rs index c366f3c02ddc..e6eb7ffc4655 100644 --- a/third_party/rust/simd/examples/matrix-inverse.rs +++ b/third_party/rust/simd/examples/matrix-inverse.rs @@ -25,6 +25,7 @@ fn mul(x: &[f32x4; 4], y: &[f32x4; 4]) -> [f32x4; 4] { ] } +#[allow(dead_code)] fn 
inverse_naive(x: &[[f32; 4]; 4]) -> [[f32; 4]; 4] { let mut t = [[0_f32; 4]; 4]; for i in 0..4 { diff --git a/third_party/rust/simd/examples/nbody-nosimd.rs b/third_party/rust/simd/examples/nbody-nosimd.rs index bafda399e754..d5f1bb422ff2 100644 --- a/third_party/rust/simd/examples/nbody-nosimd.rs +++ b/third_party/rust/simd/examples/nbody-nosimd.rs @@ -66,7 +66,7 @@ struct Planet { } fn advance(bodies: &mut [Planet;N_BODIES], dt: f64, steps: i32) { - for _ in (0..steps) { + for _ in 0..steps { let mut b_slice: &mut [_] = bodies; loop { let bi = match shift_mut_ref(&mut b_slice) { diff --git a/third_party/rust/simd/examples/ops.rs b/third_party/rust/simd/examples/ops.rs index 0e0ddcfe6d2f..f8c919101e3c 100644 --- a/third_party/rust/simd/examples/ops.rs +++ b/third_party/rust/simd/examples/ops.rs @@ -2,6 +2,7 @@ extern crate simd; use simd::*; +#[allow(unused_variables)] fn main() { let x = i32x4::splat(1_i32); let y = -x; diff --git a/third_party/rust/simd/src/aarch64/neon.rs b/third_party/rust/simd/src/aarch64/neon.rs index 50db8e1aa7fd..0cca05a52788 100644 --- a/third_party/rust/simd/src/aarch64/neon.rs +++ b/third_party/rust/simd/src/aarch64/neon.rs @@ -630,7 +630,7 @@ impl Aarch64I8x16 for i8x16 { #[doc(hidden)] pub mod common { use super::super::super::*; - use std::mem; + use core::mem; #[inline] pub fn f32x4_sqrt(x: f32x4) -> f32x4 { diff --git a/third_party/rust/simd/src/arm/neon.rs b/third_party/rust/simd/src/arm/neon.rs index b77e1211270e..e29a84040123 100644 --- a/third_party/rust/simd/src/arm/neon.rs +++ b/third_party/rust/simd/src/arm/neon.rs @@ -473,7 +473,7 @@ impl u8x8 { pub mod common { use super::super::super::*; use super::*; - use std::mem; + use core::mem; #[inline] pub fn f32x4_sqrt(x: f32x4) -> f32x4 { diff --git a/third_party/rust/simd/src/common.rs b/third_party/rust/simd/src/common.rs index 8e36b2c3a069..1052ae36959d 100644 --- a/third_party/rust/simd/src/common.rs +++ b/third_party/rust/simd/src/common.rs @@ -9,8 +9,7 @@ use super::{ 
Unalign, bitcast, }; -use std::mem; -use std::ops; +use core::{mem,ops}; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] diff --git a/third_party/rust/simd/src/lib.rs b/third_party/rust/simd/src/lib.rs index 82bae9045066..a5398ebe1e3d 100644 --- a/third_party/rust/simd/src/lib.rs +++ b/third_party/rust/simd/src/lib.rs @@ -1,4 +1,5 @@ //! `simd` offers a basic interface to the SIMD functionality of CPUs. +#![no_std] #![feature(cfg_target_feature, repr_simd, platform_intrinsics, const_fn)] #![allow(non_camel_case_types)] @@ -9,6 +10,8 @@ extern crate serde; #[macro_use] extern crate serde_derive; +use core::mem; + /// Boolean type for 8-bit integers. #[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))] #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] @@ -172,9 +175,9 @@ simd! { #[allow(dead_code)] #[inline] fn bitcast(x: T) -> U { - assert_eq!(std::mem::size_of::(), - std::mem::size_of::()); - unsafe {std::mem::transmute_copy(&x)} + assert_eq!(mem::size_of::(), + mem::size_of::()); + unsafe {mem::transmute_copy(&x)} } #[allow(dead_code)] @@ -207,9 +210,15 @@ extern "platform-intrinsic" { fn simd_xor(x: T, y: T) -> T; } #[repr(packed)] -#[derive(Debug, Copy, Clone)] +#[derive(Copy)] struct Unalign(T); +impl Clone for Unalign { + fn clone(&self) -> Unalign { + Unalign(unsafe { self.0.clone() }) + } +} + #[macro_use] mod common; mod sixty_four; diff --git a/third_party/rust/simd/src/sixty_four.rs b/third_party/rust/simd/src/sixty_four.rs index 0d3fd4363105..a87f44a77ee7 100644 --- a/third_party/rust/simd/src/sixty_four.rs +++ b/third_party/rust/simd/src/sixty_four.rs @@ -11,8 +11,7 @@ use super::{ Unalign, bitcast, }; -use std::mem; -use std::ops; +use core::{mem,ops}; /// Boolean type for 64-bit integers. 
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/third_party/rust/simd/src/v256.rs b/third_party/rust/simd/src/v256.rs index 88e486842358..519eb14e7259 100644 --- a/third_party/rust/simd/src/v256.rs +++ b/third_party/rust/simd/src/v256.rs @@ -1,6 +1,5 @@ #![allow(dead_code)] -use std::ops; -use std::mem; +use core::{mem,ops}; #[allow(unused_imports)] use super::{ Simd, @@ -329,6 +328,19 @@ impl i32x8 { } } +impl f32x8 { + /// Convert each lane to a signed integer. + #[inline] + pub fn to_i32(self) -> i32x8 { + unsafe {simd_cast(self)} + } + /// Convert each lane to an unsigned integer. + #[inline] + pub fn to_u32(self) -> u32x8 { + unsafe {simd_cast(self)} + } +} + impl i16x16 { /// Convert each lane to an unsigned integer. #[inline] diff --git a/third_party/rust/simd/src/x86/avx.rs b/third_party/rust/simd/src/x86/avx.rs index 933fa41a612b..180247e36561 100644 --- a/third_party/rust/simd/src/x86/avx.rs +++ b/third_party/rust/simd/src/x86/avx.rs @@ -54,7 +54,7 @@ extern "platform-intrinsic" { #[doc(hidden)] pub mod common { use super::*; - use std::mem; + use core::mem; macro_rules! 
bools { ($($ty: ty, $all: ident, $any: ident, $testc: ident, $testz: ident;)*) => { diff --git a/third_party/rust/simd/src/x86/avx2.rs b/third_party/rust/simd/src/x86/avx2.rs index fa92e3b60786..e86a33d3b5bb 100644 --- a/third_party/rust/simd/src/x86/avx2.rs +++ b/third_party/rust/simd/src/x86/avx2.rs @@ -42,7 +42,7 @@ extern "platform-intrinsic" { fn x86_mm256_packus_epi32(x: i32x8, y: i32x8) -> u16x16; fn x86_mm256_permutevar8x32_epi32(x: i32x8, y: i32x8) -> i32x8; fn x86_mm256_permutevar8x32_ps(x: f32x8, y: i32x8) -> f32x8; - fn x86_mm256_sad_epu8(x: u8x32, y: u8x32) -> u8x32; + fn x86_mm256_sad_epu8(x: u8x32, y: u8x32) -> u64x4; fn x86_mm256_shuffle_epi8(x: i8x32, y: i8x32) -> i8x32; fn x86_mm256_sign_epi8(x: i8x32, y: i8x32) -> i8x32; fn x86_mm256_sign_epi16(x: i16x16, y: i16x16) -> i16x16; diff --git a/third_party/rust/simd/src/x86/sse2.rs b/third_party/rust/simd/src/x86/sse2.rs index 143254379edc..5cbc853694d5 100644 --- a/third_party/rust/simd/src/x86/sse2.rs +++ b/third_party/rust/simd/src/x86/sse2.rs @@ -48,7 +48,7 @@ extern "platform-intrinsic" { #[doc(hidden)] pub mod common { use super::super::super::*; - use std::mem; + use core::mem; #[inline] pub fn f32x4_sqrt(x: f32x4) -> f32x4 { diff --git a/toolkit/library/gtest/rust/Cargo.lock b/toolkit/library/gtest/rust/Cargo.lock index 10c8a3a728c2..fb83a93b90e6 100644 --- a/toolkit/library/gtest/rust/Cargo.lock +++ b/toolkit/library/gtest/rust/Cargo.lock @@ -420,25 +420,25 @@ name = "encoding_c" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_rs 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "encoding_glue" version = "0.1.0" dependencies = [ - "encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_rs 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", "nserror 0.1.0", "nsstring 0.1.0", 
] [[package]] name = "encoding_rs" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "cfg-if 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "simd 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -1208,7 +1208,7 @@ dependencies = [ [[package]] name = "simd" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -1659,7 +1659,7 @@ dependencies = [ "checksum dwrote 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a207eb7b40e25d1d28dc679f451d321fb6954b73ceaa47986702575865469461" "checksum either 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "18785c1ba806c258137c937e44ada9ee7e69a37e3c72077542cd2f069d78562a" "checksum encoding_c 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "93ec52324ca72f423237a413ca0e1c60654c8b3d0934fcd5fd888508dfcc4ba7" -"checksum encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)" = "f5215aabf22b83153be3ee44dfe3f940214541b2ce13d419c55e7a115c8c51a9" +"checksum encoding_rs 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "98fd0f24d1fb71a4a6b9330c8ca04cbd4e7cc5d846b54ca74ff376bc7c9f798d" "checksum env_logger 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3ddf21e73e016298f5cb37d6ef8e8da8e39f91f9ec8b0df44b7deb16a9f8cd5b" "checksum error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ff511d5dc435d703f4971bc399647c9bc38e20cb41452e3b9feb4765419ed3f3" "checksum euclid 0.16.0 (registry+https://github.com/rust-lang/crates.io-index)" = "926c639bfdff1f3063f76bb66245f6d2b691aa20fdbaabecc38b2947a13a4eba" @@ -1734,7 +1734,7 @@ dependencies = [ "checksum serde 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)" = 
"db99f3919e20faa51bb2996057f5031d8685019b5a06139b1ce761da671b8526" "checksum serde_derive 1.0.27 (git+https://github.com/gankro/serde?branch=deserialize_from_enums4)" = "" "checksum serde_derive_internals 0.19.0 (git+https://github.com/gankro/serde?branch=deserialize_from_enums4)" = "" -"checksum simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a94d14a2ae1f1f110937de5fb69e494372560181c7e1739a097fcc2cee37ba0" +"checksum simd 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3dd0805c7363ab51a829a1511ad24b6ed0349feaa756c4bc2f977f9f496e6673" "checksum siphasher 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2ffc669b726f2bc9a3bcff66e5e23b56ba6bf70e22a34c3d7b6d0b3450b65b84" "checksum slab 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b4fcaed89ab08ef143da37bc52adbcc04d4a69014f4c1208d6b51f0c47bc23" "checksum smallbitvec 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "79b776f00dfe01df905fa3b2eaa1659522e99e3fc4a7b1334171622205c4bdcf" diff --git a/toolkit/library/rust/Cargo.lock b/toolkit/library/rust/Cargo.lock index 81f436e0bc52..eff11f8f86e3 100644 --- a/toolkit/library/rust/Cargo.lock +++ b/toolkit/library/rust/Cargo.lock @@ -420,25 +420,25 @@ name = "encoding_c" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_rs 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "encoding_glue" version = "0.1.0" dependencies = [ - "encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_rs 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", "nserror 0.1.0", "nsstring 0.1.0", ] [[package]] name = "encoding_rs" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "cfg-if 0.1.1 
(registry+https://github.com/rust-lang/crates.io-index)", - "simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "simd 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -1196,7 +1196,7 @@ dependencies = [ [[package]] name = "simd" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -1671,7 +1671,7 @@ dependencies = [ "checksum dwrote 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a207eb7b40e25d1d28dc679f451d321fb6954b73ceaa47986702575865469461" "checksum either 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "18785c1ba806c258137c937e44ada9ee7e69a37e3c72077542cd2f069d78562a" "checksum encoding_c 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "93ec52324ca72f423237a413ca0e1c60654c8b3d0934fcd5fd888508dfcc4ba7" -"checksum encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)" = "f5215aabf22b83153be3ee44dfe3f940214541b2ce13d419c55e7a115c8c51a9" +"checksum encoding_rs 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "98fd0f24d1fb71a4a6b9330c8ca04cbd4e7cc5d846b54ca74ff376bc7c9f798d" "checksum env_logger 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3ddf21e73e016298f5cb37d6ef8e8da8e39f91f9ec8b0df44b7deb16a9f8cd5b" "checksum error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ff511d5dc435d703f4971bc399647c9bc38e20cb41452e3b9feb4765419ed3f3" "checksum euclid 0.16.0 (registry+https://github.com/rust-lang/crates.io-index)" = "926c639bfdff1f3063f76bb66245f6d2b691aa20fdbaabecc38b2947a13a4eba" @@ -1746,7 +1746,7 @@ dependencies = [ "checksum serde 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)" = "db99f3919e20faa51bb2996057f5031d8685019b5a06139b1ce761da671b8526" "checksum serde_derive 1.0.27 (git+https://github.com/gankro/serde?branch=deserialize_from_enums4)" = "" "checksum serde_derive_internals 0.19.0 
(git+https://github.com/gankro/serde?branch=deserialize_from_enums4)" = "" -"checksum simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a94d14a2ae1f1f110937de5fb69e494372560181c7e1739a097fcc2cee37ba0" +"checksum simd 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3dd0805c7363ab51a829a1511ad24b6ed0349feaa756c4bc2f977f9f496e6673" "checksum siphasher 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2ffc669b726f2bc9a3bcff66e5e23b56ba6bf70e22a34c3d7b6d0b3450b65b84" "checksum slab 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b4fcaed89ab08ef143da37bc52adbcc04d4a69014f4c1208d6b51f0c47bc23" "checksum smallbitvec 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "79b776f00dfe01df905fa3b2eaa1659522e99e3fc4a7b1334171622205c4bdcf"