Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1261841 part 4 - Add a configuration option for enabling explicit SIMD in Rust. r=froydnj.
MozReview-Commit-ID: ICifcJ9499a
This commit is contained in:
Parent
d35d69cef8
Commit
763d66dd51
config/rules.mk
@@ -963,6 +963,11 @@ else
 environment_cleaner =
 endif
 
+rust_unlock_unstable =
+ifdef MOZ_RUST_SIMD
+rust_unlock_unstable += RUSTC_BOOTSTRAP=1
+endif
+
 # This function is intended to be called by:
 #
 # $(call CARGO_BUILD,EXTRA_ENV_VAR1=X EXTRA_ENV_VAR2=Y ...)
@@ -971,7 +976,7 @@ endif
 #
 # $(call CARGO_BUILD)
 define CARGO_BUILD
-env $(environment_cleaner) $(rustflags_override) \
+env $(environment_cleaner) $(rust_unlock_unstable) $(rustflags_override) \
 	CARGO_TARGET_DIR=$(CARGO_TARGET_DIR) \
 	RUSTC=$(RUSTC) \
 	MOZ_SRC=$(topsrcdir) \
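Note: the mechanism behind MOZ_RUST_SIMD is RUSTC_BOOTSTRAP=1, the environment variable that makes a stable rustc accept nightly-only feature gates, which the vendored `simd` crate below relies on. A minimal sketch of my own (not part of this commit) of the kind of code this unlocks:

// Hypothetical illustration: a stable rustc rejects this inner attribute
// unless RUSTC_BOOTSTRAP=1 is set in its environment, which is exactly
// what rust_unlock_unstable injects into the cargo invocation above.
#![feature(cfg_target_feature)]

fn main() {
    // At the time of this commit, cfg!(target_feature = "...") was still
    // gated behind the cfg_target_feature feature.
    println!("sse2 enabled: {}", cfg!(target_feature = "sse2"));
}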
.cargo-checksum.json
@@ -0,0 +1 @@
{"files":{".cargo-ok":"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",".gitignore":"695a11fe96751963cac851b3f556a8b922ae48ef73ee929cfe66164ea3db40cd",".travis.yml":"e2c720c3633b7671efce49147c62b12bcbf630d7c5d6fc65cd97620bfa4ddcea","Cargo.toml":"608aad04f17a524ee21048fa2ce9f656ae344e0473dd0e331dc954f0f9677c63","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6d3a9431e65e69c73a8923e6517b889d17549b23db406b9ec027710d16af701f","README.md":"249294a9a5f63c64c0f7fe4a607060f43f3507dce2378067aa59d25fb3ae681d","benches/mandelbrot.rs":"051b5199e66bca6cf7774e9024915fd4e1349ab39726a10a14e06b60d65d87a4","benches/matrix.rs":"048a21dacdb62365e0105d00d2c8cd6bd2396ac81134f2bff7eb4f7d095fb735","examples/axpy.rs":"4307626045d64ec08361c97c9c72c5dc8d361bdc88f64453b97ac0212041a1b2","examples/convert.rs":"8e658fde050f8a0d8b84ad7570446b10fcf544afbd551b940ca340474f324840","examples/dot-product.rs":"6fe2e007c147af5353804173a593c5b9d57dbccec156e1da37e9e32537363f91","examples/fannkuch-redux-nosimd.rs":"7b2fbde35e8666929d14d67328471cb0483d038a5325232f8db148b30865312b","examples/fannkuch-redux.rs":"ea21fdbd2274488a62cc984acad6e0b65d52f24fb4ff63b7057a3a667e9c8aae","examples/mandelbrot.rs":"8b8fdca1edac50e5a33e0e0592bd41eb75114f31839ccd40d485c61a9a664380","examples/matrix-inverse.rs":"a378d20ef20c2119bb10a86de27c92fec2c2f77f374e6bfd36707c9825a5fe92","examples/nbody-nosimd.rs":"2c8e0a7feacd202fdd65eeceb6420d6e9f43340b81f20a8e532704a587a2796b","examples/nbody.rs":"a864311affab262024479d6348ff51af43d809e9ad332ec30ea4aacceaa2eae1","examples/ops.rs":"1316f915d0afcfa98fdc4077e965ccccf6b4b21c433cbe487ff0cdc60df3cd39","examples/spectral-norm-nosimd.rs":"ffc8512ecde779078ea467f38f423a0ea623c63da7078193f9dd370200773f79","examples/spectral-norm.rs":"edb09c9d477f83939098cfb77a27cc298bc7a0c8a8e29cece0cccae0d70d890e","src/aarch64/mod.rs":"83f52775364c98de0cecb7e1509530c18972e932469f5f1522aa24a735d0fa37","src/aarch64/neon.rs":"1fe769979e07d8e2bc3c78ce116e05d735860744efe097a894cc9421153257fb","src/arm/mod.rs":"dcdd90bc0b39abaf86a0c8946d442b16313563fbae1ff03248628275c74d8617","src/arm/neon.rs":"51cc509856200e80f8e4cc2c982586e6d1cef593ec4537e153dce0cfe31d3428","src/common.rs":"62f4e7e0fefb52ad190d0f2191bc435ac4deab3f2bc70dc427f2a7f9ccb7856e","src/lib.rs":"25f0b39c038fa85af858318135dfd87865be26c33bb4bd1438aec96a1e68d8b5","src/sixty_four.rs":"510a9e00189a61e4f0a5beb7052d5dee37fc8261f94a2af45ef10327e0f3b7df","src/v256.rs":"2e328e49034876d535e0627c7a62191da2b4fb156a657614bf531a5fc75b1385","src/x86/avx.rs":"c66140abefca634b48eae307c3ec8cf5a40f2279b10e246a7e2ac602a2a2bb28","src/x86/avx2.rs":"efe3006b13a13261a3dec3d37dc1d8cb53950f3803c420069231803374949937","src/x86/mod.rs":"0acc5a5e2672e2a0fddc11065663be8b8fa2da87320ea291fa86ff8c2f33edf5","src/x86/sse2.rs":"5ceda75a401958a135fc9d851b22075314cdeed69fd483b6a7be4f11373f40da","src/x86/sse3.rs":"9bd01a4f08069ca4f445952e744d651efe887e3835b18872e757375f0d053bd2","src/x86/sse4_1.rs":"9ceb80dd70a7e7dfeef508cb935e1a2637175bc87a3b090f5dea691ff6aa0516","src/x86/sse4_2.rs":"c59321aed8decdce4d0d8570cff46aed02e1a8265647ef7702e9b180fc581254","src/x86/ssse3.rs":"2290f0269bae316b8e0491495645ee38a9bd73525c8572759c1328341c3bdb4c"},"package":"7a94d14a2ae1f1f110937de5fb69e494372560181c7e1739a097fcc2cee37ba0"}
.gitignore
@@ -0,0 +1,3 @@
perf.data*
/target
Cargo.lock
.travis.yml
@@ -0,0 +1,24 @@
language: rust
sudo: false

rust:
  - nightly

# load travis-cargo
before_script:
  - |
      pip install 'travis-cargo<0.2' --user &&
      export PATH=$HOME/.local/bin:$PATH

# the main build
script:
  - |
      travis-cargo build &&
      travis-cargo test &&
      travis-cargo bench &&
      travis-cargo doc -- --features doc

env:
  global:
    # override the default `--features unstable` used for the nightly branch (optional)
    - TRAVIS_CARGO_NIGHTLY_FEATURE="with-serde"
Cargo.toml
@@ -0,0 +1,26 @@
[package]
name = "simd"
version = "0.2.0"
authors = ["Huon Wilson <dbau.pp+github@gmail.com>"]

repository = "https://github.com/rust-lang-nursery/simd"
documentation = "https://rust-lang-nursery.github.io/simd/doc/simd"
license = "MIT/Apache-2.0"
keywords = ["simd", "data-parallel"]
readme = "README.md"

description = """
`simd` offers limited cross-platform access to SIMD instructions on
CPUs, as well as raw interfaces to platform-specific instructions.
"""

[dependencies]
serde = { version = "0.8", optional = true }
serde_derive = { version = "0.8", optional = true }

[dev-dependencies]
cfg-if = "0.1"

[features]
doc = []
with-serde = ["serde", "serde_derive"]
LICENSE-APACHE
@@ -0,0 +1,201 @@
                              Apache License
                        Version 2.0, January 2004
                     http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
LICENSE-MIT
@@ -0,0 +1,25 @@
Copyright (c) 2014 Huon Wilson

Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
README.md
@@ -0,0 +1,7 @@
# `simd`

[![Build Status](https://travis-ci.org/rust-lang-nursery/simd.png)](https://travis-ci.org/rust-lang-nursery/simd)

`simd` offers a basic interface to the SIMD functionality of CPUs.

[Documentation](https://docs.rs/simd)
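Note: a short usage sketch of that basic interface (my own illustration, pieced together from operations the vendored examples below use — lane construction, splat, lane-wise arithmetic, and extract):

extern crate simd;
use simd::f32x4;

fn main() {
    // Four f32 lanes processed at once; + and * apply lane-wise.
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let b = f32x4::splat(10.0);
    let c = a * b + f32x4::splat(0.5);
    println!("{} {} {} {}",
             c.extract(0), c.extract(1), c.extract(2), c.extract(3));
}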
benches/mandelbrot.rs
@@ -0,0 +1,117 @@
#![feature(test)]
#![feature(cfg_target_feature)]

extern crate simd;
extern crate test;

use test::black_box as bb;
use test::Bencher as B;
use simd::{f32x4, u32x4};
#[cfg(any(target_feature = "avx", target_feature = "avx2"))]
use simd::x86::avx::{f32x8, u32x8};

fn naive(c_x: f32, c_y: f32, max_iter: u32) -> u32 {
    let mut x = c_x;
    let mut y = c_y;
    let mut count = 0;
    while count < max_iter {
        let xy = x * y;
        let xx = x * x;
        let yy = y * y;
        let sum = xx + yy;
        if sum > 4.0 {
            break
        }
        count += 1;
        x = xx - yy + c_x;
        y = xy * 2.0 + c_y;
    }
    count
}

fn simd4(c_x: f32x4, c_y: f32x4, max_iter: u32) -> u32x4 {
    let mut x = c_x;
    let mut y = c_y;

    let mut count = u32x4::splat(0);
    for _ in 0..max_iter as usize {
        let xy = x * y;
        let xx = x * x;
        let yy = y * y;
        let sum = xx + yy;
        let mask = sum.lt(f32x4::splat(4.0));

        if !mask.any() { break }
        count = count + mask.to_i().select(u32x4::splat(1), u32x4::splat(0));

        x = xx - yy + c_x;
        y = xy + xy + c_y;
    }
    count
}

#[cfg(target_feature = "avx")]
fn simd8(c_x: f32x8, c_y: f32x8, max_iter: u32) -> u32x8 {
    let mut x = c_x;
    let mut y = c_y;

    let mut count = u32x8::splat(0);
    for _ in 0..max_iter as usize {
        let xy = x * y;
        let xx = x * x;
        let yy = y * y;
        let sum = xx + yy;
        let mask = sum.lt(f32x8::splat(4.0));

        if !mask.any() { break }
        count = count + mask.to_i().select(u32x8::splat(1), u32x8::splat(0));

        x = xx - yy + c_x;
        y = xy + xy + c_y;
    }
    count
}

const SCALE: f32 = 3.0 / 100.0;
const N: u32 = 100;
#[bench]
fn mandel_naive(b: &mut B) {
    b.iter(|| {
        for j in 0..100 {
            let y = -1.5 + (j as f32) * SCALE;
            for i in 0..100 {
                let x = -2.2 + (i as f32) * SCALE;
                bb(naive(x, y, N));
            }
        }
    })
}
#[bench]
fn mandel_simd4(b: &mut B) {
    let tweak = u32x4::new(0, 1, 2, 3);
    b.iter(|| {
        for j in 0..100 {
            let y = f32x4::splat(-1.5) + f32x4::splat(SCALE) * u32x4::splat(j).to_f32();
            for i in 0..25 {
                let i = u32x4::splat(i * 4) + tweak;
                let x = f32x4::splat(-2.2) + f32x4::splat(SCALE) * i.to_f32();
                bb(simd4(x, y, N));
            }
        }
    })
}
#[cfg(any(target_feature = "avx", target_feature = "avx2"))]
#[bench]
fn mandel_simd8(b: &mut B) {
    let tweak = u32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
    b.iter(|| {
        for j in 0..100 {
            let y = f32x8::splat(-1.5) + f32x8::splat(SCALE) * u32x8::splat(j).to_f32();
            for i in 0..13 { // 100 not divisible by 8 :(
                let i = u32x8::splat(i * 8) + tweak;
                let x = f32x8::splat(-2.2) + f32x8::splat(SCALE) * i.to_f32();
                bb(simd8(x, y, N));
            }
        }
    })
}
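Note: the masked-increment idiom in `simd4`/`simd8` above is what lets all lanes iterate together: `lt` produces a boolean mask, and `select` turns it into a per-lane 0/1 increment, so a lane stops counting once its point escapes while the loop keeps running until every lane is done. An isolated sketch of my own (not from the crate):

extern crate simd;
use simd::{f32x4, u32x4};

fn main() {
    // Pretend these are the per-lane |z|^2 values after one iteration.
    let sum = f32x4::new(1.0, 5.0, 3.0, 9.0);
    // Lanes 0 and 2 are still inside the |z|^2 < 4 circle...
    let mask = sum.lt(f32x4::splat(4.0));
    // ...so only they get +1; escaped lanes add 0.
    let count = u32x4::splat(0) + mask.to_i().select(u32x4::splat(1), u32x4::splat(0));
    println!("{} {} {} {}",
             count.extract(0), count.extract(1), count.extract(2), count.extract(3));
}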
benches/matrix.rs
@@ -0,0 +1,485 @@
#![feature(test)]
#![feature(cfg_target_feature)]
extern crate test;
extern crate simd;

use test::black_box as bb;
use test::Bencher as B;
use simd::f32x4;
#[cfg(target_feature = "avx")]
use simd::x86::avx::{f32x8, f64x4};
// #[cfg(target_feature = "avx2")]
// use simd::x86::avx2::Avx2F32x8;


#[bench]
fn multiply_naive(b: &mut B) {
    let x = [[1.0_f32; 4]; 4];
    let y = [[2.0; 4]; 4];
    b.iter(|| {
        for _ in 0..100 {
            let (x, y) = bb((&x, &y));

            bb(&[[x[0][0] * y[0][0] + x[1][0] * y[0][1] + x[2][0] * y[0][2] + x[3][0] * y[0][3],
                  x[0][1] * y[0][0] + x[1][1] * y[0][1] + x[2][1] * y[0][2] + x[3][1] * y[0][3],
                  x[0][2] * y[0][0] + x[1][2] * y[0][1] + x[2][2] * y[0][2] + x[3][2] * y[0][3],
                  x[0][3] * y[0][0] + x[1][3] * y[0][1] + x[2][3] * y[0][2] + x[3][3] * y[0][3]],
                 [x[0][0] * y[1][0] + x[1][0] * y[1][1] + x[2][0] * y[1][2] + x[3][0] * y[1][3],
                  x[0][1] * y[1][0] + x[1][1] * y[1][1] + x[2][1] * y[1][2] + x[3][1] * y[1][3],
                  x[0][2] * y[1][0] + x[1][2] * y[1][1] + x[2][2] * y[1][2] + x[3][2] * y[1][3],
                  x[0][3] * y[1][0] + x[1][3] * y[1][1] + x[2][3] * y[1][2] + x[3][3] * y[1][3]],
                 [x[0][0] * y[2][0] + x[1][0] * y[2][1] + x[2][0] * y[2][2] + x[3][0] * y[2][3],
                  x[0][1] * y[2][0] + x[1][1] * y[2][1] + x[2][1] * y[2][2] + x[3][1] * y[2][3],
                  x[0][2] * y[2][0] + x[1][2] * y[2][1] + x[2][2] * y[2][2] + x[3][2] * y[2][3],
                  x[0][3] * y[2][0] + x[1][3] * y[2][1] + x[2][3] * y[2][2] + x[3][3] * y[2][3]],
                 [x[0][0] * y[3][0] + x[1][0] * y[3][1] + x[2][0] * y[3][2] + x[3][0] * y[3][3],
                  x[0][1] * y[3][0] + x[1][1] * y[3][1] + x[2][1] * y[3][2] + x[3][1] * y[3][3],
                  x[0][2] * y[3][0] + x[1][2] * y[3][1] + x[2][2] * y[3][2] + x[3][2] * y[3][3],
                  x[0][3] * y[3][0] + x[1][3] * y[3][1] + x[2][3] * y[3][2] + x[3][3] * y[3][3]],
                 ]);
        }
    })
}

#[bench]
fn multiply_simd4_32(b: &mut B) {
    let x = [f32x4::splat(1.0_f32); 4];
    let y = [f32x4::splat(2.0); 4];
    b.iter(|| {
        for _ in 0..100 {
            let (x, y) = bb((&x, &y));

            let y0 = y[0];
            let y1 = y[1];
            let y2 = y[2];
            let y3 = y[3];
            bb(&[f32x4::splat(y0.extract(0)) * x[0] +
                 f32x4::splat(y0.extract(1)) * x[1] +
                 f32x4::splat(y0.extract(2)) * x[2] +
                 f32x4::splat(y0.extract(3)) * x[3],
                 f32x4::splat(y1.extract(0)) * x[0] +
                 f32x4::splat(y1.extract(1)) * x[1] +
                 f32x4::splat(y1.extract(2)) * x[2] +
                 f32x4::splat(y1.extract(3)) * x[3],
                 f32x4::splat(y2.extract(0)) * x[0] +
                 f32x4::splat(y2.extract(1)) * x[1] +
                 f32x4::splat(y2.extract(2)) * x[2] +
                 f32x4::splat(y2.extract(3)) * x[3],
                 f32x4::splat(y3.extract(0)) * x[0] +
                 f32x4::splat(y3.extract(1)) * x[1] +
                 f32x4::splat(y3.extract(2)) * x[2] +
                 f32x4::splat(y3.extract(3)) * x[3],
                 ]);
        }
    })
}

#[cfg(target_feature = "avx")]
#[bench]
fn multiply_simd4_64(b: &mut B) {
    let x = [f64x4::splat(1.0_f64); 4];
    let y = [f64x4::splat(2.0); 4];
    b.iter(|| {
        for _ in 0..100 {
            let (x, y) = bb((&x, &y));

            let y0 = y[0];
            let y1 = y[1];
            let y2 = y[2];
            let y3 = y[3];
            bb(&[f64x4::splat(y0.extract(0)) * x[0] +
                 f64x4::splat(y0.extract(1)) * x[1] +
                 f64x4::splat(y0.extract(2)) * x[2] +
                 f64x4::splat(y0.extract(3)) * x[3],
                 f64x4::splat(y1.extract(0)) * x[0] +
                 f64x4::splat(y1.extract(1)) * x[1] +
                 f64x4::splat(y1.extract(2)) * x[2] +
                 f64x4::splat(y1.extract(3)) * x[3],
                 f64x4::splat(y2.extract(0)) * x[0] +
                 f64x4::splat(y2.extract(1)) * x[1] +
                 f64x4::splat(y2.extract(2)) * x[2] +
                 f64x4::splat(y2.extract(3)) * x[3],
                 f64x4::splat(y3.extract(0)) * x[0] +
                 f64x4::splat(y3.extract(1)) * x[1] +
                 f64x4::splat(y3.extract(2)) * x[2] +
                 f64x4::splat(y3.extract(3)) * x[3],
                 ]);
        }
    })
}

#[bench]
fn inverse_naive(b: &mut B) {
    let mut x = [[0_f32; 4]; 4];
    for i in 0..4 { x[i][i] = 1.0 }

    b.iter(|| {
        for _ in 0..100 {
            let x = bb(&x);

            let mut t = [[0_f32; 4]; 4];
            for i in 0..4 {
                t[0][i] = x[i][0];
                t[1][i] = x[i][1];
                t[2][i] = x[i][2];
                t[3][i] = x[i][3];
            }

            let _0 = t[2][2] * t[3][3];
            let _1 = t[2][3] * t[3][2];
            let _2 = t[2][1] * t[3][3];
            let _3 = t[2][3] * t[3][1];
            let _4 = t[2][1] * t[3][2];
            let _5 = t[2][2] * t[3][1];
            let _6 = t[2][0] * t[3][3];
            let _7 = t[2][3] * t[3][0];
            let _8 = t[2][0] * t[3][2];
            let _9 = t[2][2] * t[3][0];
            let _10 = t[2][0] * t[3][1];
            let _11 = t[2][1] * t[3][0];

            let d00 = _0 * t[1][1] + _3 * t[1][2] + _4 * t[1][3] -
                      (_1 * t[1][1] + _2 * t[1][2] + _5 * t[1][3]);
            let d01 = _1 * t[1][0] + _6 * t[1][2] + _9 * t[1][3] -
                      (_0 * t[1][0] + _7 * t[1][2] + _8 * t[1][3]);
            let d02 = _2 * t[1][0] + _7 * t[1][1] + _10 * t[1][3] -
                      (_3 * t[1][0] + _6 * t[1][1] + _11 * t[1][3]);
            let d03 = _5 * t[1][0] + _8 * t[1][1] + _11 * t[1][2] -
                      (_4 * t[1][0] + _9 * t[1][1] + _10 * t[1][2]);
            let d10 = _1 * t[0][1] + _2 * t[0][2] + _5 * t[0][3] -
                      (_0 * t[0][1] + _3 * t[0][2] + _4 * t[0][3]);
            let d11 = _0 * t[0][0] + _7 * t[0][2] + _8 * t[0][3] -
                      (_1 * t[0][0] + _6 * t[0][2] + _9 * t[0][3]);
            let d12 = _3 * t[0][0] + _6 * t[0][1] + _11 * t[0][3] -
                      (_2 * t[0][0] + _7 * t[0][1] + _10 * t[0][3]);
            let d13 = _4 * t[0][0] + _9 * t[0][1] + _10 * t[0][2] -
                      (_5 * t[0][0] + _8 * t[0][1] + _11 * t[0][2]);

            let _0 = t[0][2] * t[1][3];
            let _1 = t[0][3] * t[1][2];
            let _2 = t[0][1] * t[1][3];
            let _3 = t[0][3] * t[1][1];
            let _4 = t[0][1] * t[1][2];
            let _5 = t[0][2] * t[1][1];
            let _6 = t[0][0] * t[1][3];
            let _7 = t[0][3] * t[1][0];
            let _8 = t[0][0] * t[1][2];
            let _9 = t[0][2] * t[1][0];
            let _10 = t[0][0] * t[1][1];
            let _11 = t[0][1] * t[1][0];

            let d20 = _0*t[3][1] + _3*t[3][2] + _4*t[3][3]-
                      (_1*t[3][1] + _2*t[3][2] + _5*t[3][3]);
            let d21 = _1*t[3][0] + _6*t[3][2] + _9*t[3][3]-
                      (_0*t[3][0] + _7*t[3][2] + _8*t[3][3]);
            let d22 = _2*t[3][0] + _7*t[3][1] + _10*t[3][3]-
                      (_3*t[3][0] + _6*t[3][1] + _11*t[3][3]);
            let d23 = _5*t[3][0] + _8*t[3][1] + _11*t[3][2]-
                      (_4*t[3][0] + _9*t[3][1] + _10*t[3][2]);
            let d30 = _2*t[2][2] + _5*t[2][3] + _1*t[2][1]-
                      (_4*t[2][3] + _0*t[2][1] + _3*t[2][2]);
            let d31 = _8*t[2][3] + _0*t[2][0] + _7*t[2][2]-
                      (_6*t[2][2] + _9*t[2][3] + _1*t[2][0]);
            let d32 = _6*t[2][1] + _11*t[2][3] + _3*t[2][0]-
                      (_10*t[2][3] + _2*t[2][0] + _7*t[2][1]);
            let d33 = _10*t[2][2] + _4*t[2][0] + _9*t[2][1]-
                      (_8*t[2][1] + _11*t[2][2] + _5*t[2][0]);

            let det = t[0][0] * d00 + t[0][1] * d01 + t[0][2] * d02 + t[0][3] * d03;

            let det = 1.0 / det;
            let mut ret = [[d00, d01, d02, d03],
                           [d10, d11, d12, d13],
                           [d20, d21, d22, d23],
                           [d30, d31, d32, d33]];
            for i in 0..4 {
                for j in 0..4 {
                    ret[i][j] *= det;
                }
            }
            bb(&ret);
        }
    })
}

#[bench]
fn inverse_simd4(b: &mut B) {
    let mut x = [f32x4::splat(0_f32); 4];
    for i in 0..4 { x[i] = x[i].replace(i as u32, 1.0); }

    fn shuf0145(v: f32x4, w: f32x4) -> f32x4 {
        f32x4::new(v.extract(0), v.extract(1),
                   w.extract(4 - 4), w.extract(5 - 4))
    }
    fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
        f32x4::new(v.extract(0), v.extract(2),
                   w.extract(4 - 4), w.extract(6 - 4))
    }
    fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
        f32x4::new(v.extract(1), v.extract(3),
                   w.extract(5 - 4), w.extract(7 - 4))
    }
    fn shuf2367(v: f32x4, w: f32x4) -> f32x4 {
        f32x4::new(v.extract(2), v.extract(3),
                   w.extract(6 - 4), w.extract(7 - 4))
    }

    fn swiz1032(v: f32x4) -> f32x4 {
        f32x4::new(v.extract(1), v.extract(0),
                   v.extract(3), v.extract(2))
    }
    fn swiz2301(v: f32x4) -> f32x4 {
        f32x4::new(v.extract(2), v.extract(3),
                   v.extract(0), v.extract(1))
    }

    b.iter(|| {
        for _ in 0..100 {
            let src0;
            let src1;
            let src2;
            let src3;
            let mut tmp1;
            let row0;
            let mut row1;
            let mut row2;
            let mut row3;
            let mut minor0;
            let mut minor1;
            let mut minor2;
            let mut minor3;
            let mut det;

            let x = bb(&x);
            src0 = x[0];
            src1 = x[1];
            src2 = x[2];
            src3 = x[3];

            tmp1 = shuf0145(src0, src1);
            row1 = shuf0145(src2, src3);
            row0 = shuf0246(tmp1, row1);
            row1 = shuf1357(row1, tmp1);

            tmp1 = shuf2367(src0, src1);
            row3 = shuf2367(src2, src3);
            row2 = shuf0246(tmp1, row3);
            row3 = shuf0246(row3, tmp1);


            tmp1 = row2 * row3;
            tmp1 = swiz1032(tmp1);
            minor0 = row1 * tmp1;
            minor1 = row0 * tmp1;
            tmp1 = swiz2301(tmp1);
            minor0 = (row1 * tmp1) - minor0;
            minor1 = (row0 * tmp1) - minor1;
            minor1 = swiz2301(minor1);


            tmp1 = row1 * row2;
            tmp1 = swiz1032(tmp1);
            minor0 = (row3 * tmp1) + minor0;
            minor3 = row0 * tmp1;
            tmp1 = swiz2301(tmp1);

            minor0 = minor0 - row3 * tmp1;
            minor3 = row0 * tmp1 - minor3;
            minor3 = swiz2301(minor3);


            tmp1 = row3 * swiz2301(row1);
            tmp1 = swiz1032(tmp1);
            row2 = swiz2301(row2);
            minor0 = row2 * tmp1 + minor0;
            minor2 = row0 * tmp1;
            tmp1 = swiz2301(tmp1);
            minor0 = minor0 - row2 * tmp1;
            minor2 = row0 * tmp1 - minor2;
            minor2 = swiz2301(minor2);


            tmp1 = row0 * row1;
            tmp1 = swiz1032(tmp1);
            minor2 = minor2 + row3 * tmp1;
            minor3 = row2 * tmp1 - minor3;
            tmp1 = swiz2301(tmp1);
            minor2 = row3 * tmp1 - minor2;
            minor3 = minor3 - row2 * tmp1;


            tmp1 = row0 * row3;
            tmp1 = swiz1032(tmp1);
            minor1 = minor1 - row2 * tmp1;
            minor2 = row1 * tmp1 + minor2;
            tmp1 = swiz2301(tmp1);
            minor1 = row2 * tmp1 + minor1;
            minor2 = minor2 - row1 * tmp1;

            tmp1 = row0 * row2;
            tmp1 = swiz1032(tmp1);
            minor1 = row3 * tmp1 + minor1;
            minor3 = minor3 - row1 * tmp1;
            tmp1 = swiz2301(tmp1);
            minor1 = minor1 - row3 * tmp1;
            minor3 = row1 * tmp1 + minor3;

            det = row0 * minor0;
            det = swiz2301(det) + det;
            det = swiz1032(det) + det;
            //tmp1 = det.approx_reciprocal(); det = tmp1 * (f32x4::splat(2.0) - det * tmp1);
            det = f32x4::splat(1.0) / det;

            bb(&[minor0 * det, minor1 * det, minor2 * det, minor3 * det]);
        }
    })

}

#[bench]
fn transpose_naive(b: &mut B) {
    let x = [[0_f32; 4]; 4];
    b.iter(|| {
        for _ in 0..100 {
            let x = bb(&x);
            bb(&[[x[0][0], x[1][0], x[2][0], x[3][0]],
                 [x[0][1], x[1][1], x[2][1], x[3][1]],
                 [x[0][2], x[1][2], x[2][2], x[3][2]],
                 [x[0][3], x[1][3], x[2][3], x[3][3]]]);
        }
    })
}

#[bench]
fn transpose_simd4(b: &mut B) {
    let x = [f32x4::splat(0_f32); 4];

    fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
        f32x4::new(v.extract(0), v.extract(2),
                   w.extract(4 - 4), w.extract(6 - 4))
    }
    fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
        f32x4::new(v.extract(1), v.extract(3),
                   w.extract(5 - 4), w.extract(7 - 4))
    }
    b.iter(|| {
        for _ in 0..100 {
            let x = bb(&x);
            let x0 = x[0];
            let x1 = x[1];
            let x2 = x[2];
            let x3 = x[3];

            let a0 = shuf0246(x0, x1);
            let a1 = shuf0246(x2, x3);
            let a2 = shuf1357(x0, x1);
            let a3 = shuf1357(x2, x3);

            let b0 = shuf0246(a0, a1);
            let b1 = shuf0246(a2, a3);
            let b2 = shuf1357(a0, a1);
            let b3 = shuf1357(a2, a3);
            bb(&[b0, b1, b2, b3]);
        }
    })
}

#[cfg(target_feature = "avx")]
#[bench]
fn transpose_simd8_naive(b: &mut B) {
    let x = [f32x8::splat(0_f32); 2];

    fn shuf0246(v: f32x8, w: f32x8) -> f32x8 {
        f32x8::new(v.extract(0), v.extract(2), v.extract(4), v.extract(6),
                   w.extract(0), w.extract(2), w.extract(4), w.extract(6))
    }
    fn shuf1357(v: f32x8, w: f32x8) -> f32x8 {
        f32x8::new(v.extract(1), v.extract(3), v.extract(5), v.extract(7),
                   w.extract(1), w.extract(3), w.extract(5), w.extract(7),)
    }
    b.iter(|| {
        for _ in 0..100 {
            let x = bb(&x);
            let x01 = x[0];
            let x23 = x[1];

            let a01 = shuf0246(x01, x23);
            let a23 = shuf1357(x01, x23);

            let b01 = shuf0246(a01, a23);
            let b23 = shuf1357(a01, a23);
            bb(&[b01, b23]);
        }
    })
}

#[cfg(target_feature = "avx")]
#[bench]
fn transpose_simd8_avx2_vpermps(b: &mut B) {
    let x = [f32x8::splat(0_f32); 2];

    // efficient on AVX2 using vpermps
    fn perm04152637(v: f32x8) -> f32x8 {
        // broken on rustc 1.7.0-nightly (1ddaf8bdf 2015-12-12)
        // v.permutevar(i32x8::new(0, 4, 1, 5, 2, 6, 3, 7))
        f32x8::new(v.extract(0), v.extract(4), v.extract(1), v.extract(5),
                   v.extract(2), v.extract(6), v.extract(3), v.extract(7))
    }
    fn shuf_lo(v: f32x8, w: f32x8) -> f32x8 {
        f32x8::new(v.extract(0), v.extract(1), w.extract(0), w.extract(1),
                   v.extract(4), v.extract(5), w.extract(4), w.extract(5),)
    }
    fn shuf_hi(v: f32x8, w: f32x8) -> f32x8 {
        f32x8::new(v.extract(2), v.extract(3), w.extract(2), w.extract(3),
                   v.extract(6), v.extract(7), w.extract(6), w.extract(7),)
    }
    b.iter(|| {
        for _ in 0..100 {
            let x = bb(&x);
            let x01 = x[0];
            let x23 = x[1];

            let a01 = perm04152637(x01);
            let a23 = perm04152637(x23);

            let b01 = shuf_lo(a01, a23);
            let b23 = shuf_hi(a01, a23);
            bb(&[b01, b23]);
        }
    })
}

#[cfg(target_feature = "avx")]
#[bench]
fn transpose_simd8_avx2_vpermpd(b: &mut B) {
    let x = [f32x8::splat(0_f32); 2];

    // efficient on AVX2 using vpermpd
    fn perm01452367(v: f32x8) -> f32x8 {
        f32x8::new(v.extract(0), v.extract(1), v.extract(4), v.extract(5),
                   v.extract(2), v.extract(3), v.extract(6), v.extract(7))
    }
    fn shuf_lo_ps(v: f32x8, w: f32x8) -> f32x8 {
        f32x8::new(v.extract(0), w.extract(0), v.extract(1), w.extract(1),
                   v.extract(4), w.extract(4), v.extract(5), w.extract(5),)
    }
    fn shuf_hi_ps(v: f32x8, w: f32x8) -> f32x8 {
        f32x8::new(v.extract(2), w.extract(2), v.extract(3), w.extract(3),
                   v.extract(6), w.extract(6), v.extract(7), w.extract(7),)
    }
    b.iter(|| {
        for _ in 0..100 {
            let x = bb(&x);
            let x01 = x[0];
            let x23 = x[1];

            let a01 = perm01452367(x01);
            let a23 = perm01452367(x23);

            let b01 = shuf_lo_ps(a01, a23);
            let b23 = shuf_hi_ps(a01, a23);
            bb(&[b01, b23]);
        }
    })
}
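Note on `transpose_simd4` above: `shuf0246` packs the even lanes of two vectors and `shuf1357` the odd lanes, and two rounds of that pair turn four rows into four columns. A standalone check of my own (not part of the benchmark), with `w.extract(4 - 4)` written as `w.extract(0)`:

extern crate simd;
use simd::f32x4;

// Even lanes of v, then even lanes of w.
fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
    f32x4::new(v.extract(0), v.extract(2), w.extract(0), w.extract(2))
}
// Odd lanes of v, then odd lanes of w.
fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
    f32x4::new(v.extract(1), v.extract(3), w.extract(1), w.extract(3))
}

fn main() {
    // Rows of a 4x4 matrix; entry (r, c) is encoded as 10*r + c.
    let x0 = f32x4::new(0.0, 1.0, 2.0, 3.0);
    let x1 = f32x4::new(10.0, 11.0, 12.0, 13.0);
    let x2 = f32x4::new(20.0, 21.0, 22.0, 23.0);
    let x3 = f32x4::new(30.0, 31.0, 32.0, 33.0);

    // First round: gather even/odd lanes across the row pairs.
    let a0 = shuf0246(x0, x1);
    let a1 = shuf0246(x2, x3);
    let a2 = shuf1357(x0, x1);
    let a3 = shuf1357(x2, x3);

    // Second round yields the columns: b0 = (0, 10, 20, 30), and so on.
    let b0 = shuf0246(a0, a1);
    let b1 = shuf0246(a2, a3);
    let b2 = shuf1357(a0, a1);
    let b3 = shuf1357(a2, a3);
    for v in &[b0, b1, b2, b3] {
        println!("{} {} {} {}", v.extract(0), v.extract(1), v.extract(2), v.extract(3));
    }
}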
examples/axpy.rs
@@ -0,0 +1,65 @@
#![feature(cfg_target_feature)]
extern crate simd;
use simd::f32x4;
#[cfg(target_feature = "avx")]
use simd::x86::avx::f32x8;

#[inline(never)]
pub fn axpy(z: &mut [f32], a: f32, x: &[f32], y: &[f32]) {
    assert_eq!(x.len(), y.len());
    assert_eq!(x.len(), z.len());

    let len = std::cmp::min(std::cmp::min(x.len(), y.len()), z.len());

    let mut i = 0;
    while i < len & !3 {
        let x = f32x4::load(x, i);
        let y = f32x4::load(y, i);
        (f32x4::splat(a) * x + y).store(z, i);
        i += 4
    }
}

#[cfg(target_feature = "avx")]
#[inline(never)]
pub fn axpy8(z: &mut [f32], a: f32, x: &[f32], y: &[f32]) {
    assert_eq!(x.len(), y.len());
    assert_eq!(x.len(), z.len());

    let len = std::cmp::min(std::cmp::min(x.len(), y.len()), z.len());

    let mut i = 0;
    while i < len & !7 {
        let x = f32x8::load(x, i);
        let y = f32x8::load(y, i);
        (f32x8::splat(a) * x + y).store(z, i);
        i += 8
    }
}


#[cfg(not(target_feature = "avx"))]
pub fn axpy8(_: &mut [f32], _: f32, _: &[f32], _: &[f32]) {
    unimplemented!()
}


fn main() {
    let mut z = vec![0.; 4];
    axpy(&mut z, 2., &[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]);
    println!("{:?}", z);
    let mut z = vec![0.; 8];
    axpy(&mut z, 3., &[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
         &[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]);
    println!("{:?}", z);

    if cfg!(target_feature = "avx") {
        let mut z = vec![0.; 4];
        axpy8(&mut z, 2., &[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]);
        println!("{:?}", z);
        let mut z = vec![0.; 8];
        axpy8(&mut z, 3., &[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
              &[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]);
        println!("{:?}", z);
    }
}
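Note: `axpy` only processes `len & !3` elements (`len` rounded down to a multiple of 4), so up to three trailing elements are left untouched; the same applies to the other whole-slice examples here. A hypothetical variant of my own (not in this commit) that finishes with a scalar tail:

extern crate simd;
use simd::f32x4;

pub fn axpy_with_tail(z: &mut [f32], a: f32, x: &[f32], y: &[f32]) {
    assert_eq!(x.len(), y.len());
    assert_eq!(x.len(), z.len());

    let len = x.len();
    let mut i = 0;
    // Vector body, identical to axpy above.
    while i < len & !3 {
        let xv = f32x4::load(x, i);
        let yv = f32x4::load(y, i);
        (f32x4::splat(a) * xv + yv).store(z, i);
        i += 4
    }
    // Scalar tail for the remaining len % 4 elements.
    while i < len {
        z[i] = a * x[i] + y[i];
        i += 1
    }
}

fn main() {
    let mut z = vec![0.; 6];
    axpy_with_tail(&mut z, 2., &[1., 2., 3., 4., 5., 6.], &[1.; 6]);
    println!("{:?}", z);
}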
examples/convert.rs
@@ -0,0 +1,38 @@
extern crate simd;
use simd::f32x4;

#[inline(never)]
pub fn convert_scalar(x: &mut [i32], y: &[f32]) {
    assert_eq!(x.len(), y.len());

    let mut i = 0;
    while i < x.len() & !3 {
        x[i] = y[i] as i32;
        i += 1;
    }
}

#[inline(never)]
pub fn convert(x: &mut [i32], y: &[f32]) {
    assert_eq!(x.len(), y.len());

    let mut i = 0;
    while i < x.len() & !3 {
        let v = f32x4::load(y, i);
        v.to_i32().store(x, i);
        i += 4
    }
}

fn main() {
    let x = &mut [0; 12];
    let y = [1.0; 12];
    convert(x, &y);
    convert_scalar(x, &y);
    println!("{:?}", x);
    let x = &mut [0; 16];
    let y = [1.0; 16];
    convert(x, &y);
    convert_scalar(x, &y);
    println!("{:?}", x);
}
examples/dot-product.rs
@@ -0,0 +1,60 @@
#![feature(cfg_target_feature)]
extern crate simd;
use simd::f32x4;
#[cfg(target_feature = "avx")]
use simd::x86::avx::{f32x8, LowHigh128};

#[inline(never)]
pub fn dot(x: &[f32], y: &[f32]) -> f32 {
    assert_eq!(x.len(), y.len());

    let len = std::cmp::min(x.len(), y.len());

    let mut sum = f32x4::splat(0.0);
    let mut i = 0;
    while i < len & !3 {
        let x = f32x4::load(x, i);
        let y = f32x4::load(y, i);
        sum = sum + x * y;
        i += 4
    }
    sum.extract(0) + sum.extract(1) + sum.extract(2) + sum.extract(3)
}

#[cfg(target_feature = "avx")]
#[inline(never)]
pub fn dot8(x: &[f32], y: &[f32]) -> f32 {
    assert_eq!(x.len(), y.len());

    let len = std::cmp::min(x.len(), y.len());

    let mut sum = f32x8::splat(0.0);
    let mut i = 0;
    while i < len & !7 {
        let x = f32x8::load(x, i);
        let y = f32x8::load(y, i);
        sum = sum + x * y;
        i += 8
    }
    let sum = sum.low() + sum.high();
    sum.extract(0) + sum.extract(1) + sum.extract(2) + sum.extract(3)
}


#[cfg(not(target_feature = "avx"))]
pub fn dot8(_: &[f32], _: &[f32]) -> f32 {
    unimplemented!()
}


fn main() {
    println!("{}", dot(&[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]));
    println!("{}", dot(&[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
                       &[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]));

    if cfg!(target_feature = "avx") {
        println!("{}", dot8(&[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]));
        println!("{}", dot8(&[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
                            &[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]));
    }
}
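Note on the reduction at the end of `dot`: the loop accumulates four interleaved partial sums (lane j holds the products at indices congruent to j mod 4), so the final horizontal add of the lanes yields the same value a scalar loop would produce. A tiny equivalence check of my own (not from the repository):

extern crate simd;
use simd::f32x4;

fn main() {
    let x = [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let y = [8.0_f32, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0];

    // Vector accumulation: lane j sums the products at indices j, j + 4, ...
    let mut sum = f32x4::splat(0.0);
    let mut i = 0;
    while i < x.len() & !3 {
        sum = sum + f32x4::load(&x, i) * f32x4::load(&y, i);
        i += 4;
    }
    let vector = sum.extract(0) + sum.extract(1) + sum.extract(2) + sum.extract(3);

    // Scalar reference computation.
    let scalar: f32 = x.iter().zip(y.iter()).map(|(a, b)| a * b).sum();
    println!("{} == {}", vector, scalar);
}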
examples/fannkuch-redux-nosimd.rs
@@ -0,0 +1,156 @@
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// contributed by the Rust Project Developers
// contributed by TeXitoi

use std::{cmp, mem};
use std::thread;

fn rotate(x: &mut [i32]) {
    let mut prev = x[0];
    for place in x.iter_mut().rev() {
        prev = mem::replace(place, prev)
    }
}

fn next_permutation(perm: &mut [i32], count: &mut [i32]) {
    for i in 1..perm.len() {
        rotate(&mut perm[.. i + 1]);
        let count_i = &mut count[i];
        if *count_i >= i as i32 {
            *count_i = 0;
        } else {
            *count_i += 1;
            break
        }
    }
}

#[derive(Clone, Copy)]
struct P {
    p: [i32; 16],
}

#[derive(Clone, Copy)]
struct Perm {
    cnt: [i32; 16],
    fact: [u32; 16],
    n: u32,
    permcount: u32,
    perm: P,
}

impl Perm {
    fn new(n: u32) -> Perm {
        let mut fact = [1; 16];
        for i in 1 .. n as usize + 1 {
            fact[i] = fact[i - 1] * i as u32;
        }
        Perm {
            cnt: [0; 16],
            fact: fact,
            n: n,
            permcount: 0,
            perm: P { p: [0; 16 ] }
        }
    }

    fn get(&mut self, mut idx: i32) -> P {
        let mut pp = [0u8; 16];
        self.permcount = idx as u32;
        for (i, place) in self.perm.p.iter_mut().enumerate() {
            *place = i as i32 + 1;
        }

        for i in (1 .. self.n as usize).rev() {
            let d = idx / self.fact[i] as i32;
            self.cnt[i] = d;
            idx %= self.fact[i] as i32;
            for (place, val) in pp.iter_mut().zip(self.perm.p[..(i+1)].iter()) {
                *place = (*val) as u8
            }

            let d = d as usize;
            for j in 0 .. i + 1 {
                self.perm.p[j] = if j + d <= i {pp[j + d]} else {pp[j+d-i-1]} as i32;
            }
        }

        self.perm
    }

    fn count(&self) -> u32 { self.permcount }
    fn max(&self) -> u32 { self.fact[self.n as usize] }

    fn next(&mut self) -> P {
        next_permutation(&mut self.perm.p, &mut self.cnt);
        self.permcount += 1;

        self.perm
    }
}


fn reverse(tperm: &mut [i32], k: usize) {
    tperm[..k].reverse()
}

fn work(mut perm: Perm, n: usize, max: usize) -> (i32, i32) {
    let mut checksum = 0;
    let mut maxflips = 0;

    let mut p = perm.get(n as i32);

    while perm.count() < max as u32 {
        let mut flips = 0;

        while p.p[0] != 1 {
            let k = p.p[0] as usize;
            reverse(&mut p.p, k);
            flips += 1;
        }

        checksum += if perm.count() % 2 == 0 {flips} else {-flips};
        maxflips = cmp::max(maxflips, flips);

        p = perm.next();
    }

    (checksum, maxflips)
}

fn fannkuch(n: i32) -> (i32, i32) {
    let perm = Perm::new(n as u32);

    let n = 1;
    let mut futures = vec![];
    let k = perm.max() / n;

    for j in (0..).map(|x| x * k).take_while(|&j| j < k * n) {
        let max = cmp::min(j+k, perm.max());

        futures.push(thread::spawn(move|| {
            work(perm, j as usize, max as usize)
        }))
    }

    let mut checksum = 0;
    let mut maxflips = 0;
    for fut in futures.into_iter() {
        let (cs, mf) = fut.join().unwrap();
        checksum += cs;
        maxflips = cmp::max(maxflips, mf);
    }
    (checksum, maxflips)
}

fn main() {
    let n = std::env::args_os().nth(1)
        .and_then(|s| s.into_string().ok())
        .and_then(|n| n.parse().ok())
        .unwrap_or(7);

    let (checksum, maxflips) = fannkuch(n);
    println!("{}\nPfannkuchen({}) = {}", checksum, n, maxflips);
}
examples/fannkuch-redux.rs
@@ -0,0 +1,233 @@
#![feature(cfg_target_feature)]
extern crate simd;
#[macro_use] extern crate cfg_if;
use simd::u8x16;

use std::{env, process};

cfg_if! {
    if #[cfg(target_arch = "aarch64")] {
        #[inline(always)]
        fn shuffle(x: u8x16, y: u8x16) -> u8x16 {
            use simd::aarch64::neon::*;
            y.table_lookup_1(x)
        }
    } else if #[cfg(all(target_arch = "arm",
                        target_feature = "neon"))] {
        #[inline(always)]
        fn shuffle(x: u8x16, y: u8x16) -> u8x16 {
            use simd::arm::neon::*;
            #[inline(always)]
            fn split(x: u8x16) -> (u8x8, u8x8) {
                unsafe {std::mem::transmute(x)}
            }
            fn join(x: u8x8, y: u8x8) -> u8x16 {
                unsafe {std::mem::transmute((x, y))}
            }

            let (t0, t1) = split(x);
            let (i0, i1) = split(y);
            join(i0.table_lookup_2(t0, t1),
                 i1.table_lookup_2(t0, t1))
        }
    } else if #[cfg(target_feature = "ssse3")] {
        #[inline(always)]
        fn shuffle(x: u8x16, y: u8x16) -> u8x16 {
            use simd::x86::ssse3::*;
            x.shuffle_bytes(y)
        }
    } else {
        // slow fallback, so tests work
        #[inline(always)]
        fn shuffle(x: u8x16, y: u8x16) -> u8x16 {
            u8x16::new(x.extract(y.extract(0) as u32),
                       x.extract(y.extract(1) as u32),
                       x.extract(y.extract(2) as u32),
                       x.extract(y.extract(3) as u32),
                       x.extract(y.extract(4) as u32),
                       x.extract(y.extract(5) as u32),
                       x.extract(y.extract(6) as u32),
                       x.extract(y.extract(7) as u32),
                       x.extract(y.extract(8) as u32),
                       x.extract(y.extract(9) as u32),
                       x.extract(y.extract(10) as u32),
                       x.extract(y.extract(11) as u32),
                       x.extract(y.extract(12) as u32),
                       x.extract(y.extract(13) as u32),
                       x.extract(y.extract(14) as u32),
                       x.extract(y.extract(15) as u32))
        }
    }
}
struct State {
    s: [u8; 16],
    flip_masks: [u8x16; 16],
    rotate_masks: [u8x16; 16],

    maxflips: i32,
    odd: u16,
    checksum: i32,
}
impl State {
    fn new() -> State {
        State {
            s: [0; 16],
            flip_masks: [u8x16::splat(0); 16],
            rotate_masks: [u8x16::splat(0); 16],

            maxflips: 0,
            odd: 0,
            checksum: 0,
        }
    }
    #[inline(never)]
    fn rotate_sisd(&mut self, n: usize) {
        let c = self.s[0];
        for i in 1..(n + 1) {
            self.s[i - 1] = self.s[i];
        }
        self.s[n] = c;
    }
    #[inline(never)]
    fn popmasks(&mut self) {
        let mut mask = [0_u8; 16];
        for i in 0..16 {
            for j in 0..16 { mask[j] = j as u8; }

            for x in 0..(i+1)/2 {
                mask.swap(x, i - x);
            }

            self.flip_masks[i] = u8x16::load(&mask, 0);

            for j in 0..16 { self.s[j] = j as u8; }
            self.rotate_sisd(i);
            self.rotate_masks[i] = self.load_s();
        }
    }
    fn rotate(&mut self, n: usize) {
        shuffle(self.load_s(), self.rotate_masks[n]).store(&mut self.s, 0)
    }

    fn load_s(&self) -> u8x16 {
        u8x16::load(&self.s, 0)
    }


    #[inline(never)]
    fn tk(&mut self, n: usize) {
        #[derive(Copy, Clone, Debug)]
        struct Perm {
            perm: u8x16,
            start: u8,
            odd: u16
        }

        let mut perms = [Perm { perm: u8x16::splat(0), start: 0 , odd: 0 }; 60];

        let mut i = 0;
        let mut c = [0_u8; 16];
        let mut perm_max = 0;

        while i < n {
            while i < n && perm_max < 60 {
                self.rotate(i);
                if c[i] as usize >= i {
                    c[i] = 0;
                    i += 1;
                    continue
                }

                c[i] += 1;
                i = 1;
                self.odd = !self.odd;
                if self.s[0] != 0 {
                    if self.s[self.s[0] as usize] != 0 {
                        perms[perm_max].perm = self.load_s();
                        perms[perm_max].start = self.s[0];
                        perms[perm_max].odd = self.odd;
                        perm_max += 1;
                    } else {
                        if self.maxflips == 0 { self.maxflips = 1 }
                        self.checksum += if self.odd != 0 { -1 } else { 1 };
                    }
                }
            }

            let mut k = 0;
            while k < std::cmp::max(1, perm_max) - 1 {
                let pk = &perms[k];
                let pk1 = &perms[k + 1];
                //println!("perm1 {:?}\nperm2 {:?}", pk.perm, pk1.perm);
                let mut perm1 = pk.perm;
                let mut perm2 = pk1.perm;

                let mut f1 = 0;
                let mut f2 = 0;
                let mut toterm1 = pk.start;
                let mut toterm2 = pk1.start;

                while toterm1 != 0 && toterm2 != 0 {
                    perm1 = shuffle(perm1, self.flip_masks[toterm1 as usize]);
                    perm2 = shuffle(perm2, self.flip_masks[toterm2 as usize]);
                    toterm1 = perm1.extract(0);
                    toterm2 = perm2.extract(0);

                    f1 += 1; f2 += 1;
                }
                while toterm1 != 0 {
                    perm1 = shuffle(perm1, self.flip_masks[toterm1 as usize]);
                    toterm1 = perm1.extract(0);
                    f1 += 1;
                }
                while toterm2 != 0 {
                    perm2 = shuffle(perm2, self.flip_masks[toterm2 as usize]);
                    toterm2 = perm2.extract(0);
                    f2 += 1;
                }

                if f1 > self.maxflips { self.maxflips = f1 }
                if f2 > self.maxflips { self.maxflips = f2 }
                self.checksum += if pk.odd != 0 { -f1 } else { f1 };
                self.checksum += if pk1.odd != 0 { -f2 } else { f2 };

                k += 2;
            }
            while k < perm_max {
                let pk = &perms[k];
                let mut perm = pk.perm;
                let mut f = 0;
                let mut toterm = pk.start;
                while toterm != 0 {
                    perm = shuffle(perm, self.flip_masks[toterm as usize]);
                    toterm = perm.extract(0);
                    f += 1;
                }
                if f > self.maxflips { self.maxflips = f }
                self.checksum += if pk.odd != 0 { -f } else { f };
                k += 1
            }
            perm_max = 0;
        }
    }
}

fn main() {
    let mut state = State::new();
    state.popmasks();

    let args = env::args().collect::<Vec<_>>();
    if args.len() < 2 {
        println!("usage: {} number", args[0]);
        process::exit(1)
    }
    let max_n = args[1].parse().unwrap();
    if max_n < 3 || max_n > 15 {
        println!("range: must be 3 <= n <= 14");
        process::exit(1);
    }
    for i in 0..max_n { state.s[i] = i as u8 }
    state.tk(max_n);

    println!("{}\nPfannkuchen({}) = {}", state.checksum, max_n, state.maxflips);
}
@ -0,0 +1,125 @@
|
|||
#![feature(step_by, test)]
|
||||
|
||||
extern crate test;
|
||||
extern crate simd;
|
||||
use simd::{f32x4, u32x4};
|
||||
use std::io::prelude::*;
|
||||
|
||||
#[inline(never)]
|
||||
fn mandelbrot_naive(c_x: f32, c_y: f32, max_iter: u32) -> u32 {
|
||||
let mut x = c_x;
|
||||
let mut y = c_y;
|
||||
let mut count = 0;
|
||||
while count < max_iter {
|
||||
let xy = x * y;
|
||||
let xx = x * x;
|
||||
let yy = y * y;
|
||||
let sum = xx + yy;
|
||||
if sum > 4.0 {
|
||||
break
|
||||
}
|
||||
count += 1;
|
||||
x = xx - yy + c_x;
|
||||
y = xy * 2.0 + c_y;
|
||||
}
|
||||
count
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn mandelbrot_vector(c_x: f32x4, c_y: f32x4, max_iter: u32) -> u32x4 {
|
||||
let mut x = c_x;
|
||||
let mut y = c_y;
|
||||
|
||||
let mut count = u32x4::splat(0);
|
||||
for _ in 0..max_iter as usize {
|
||||
let xy = x * y;
|
||||
let xx = x * x;
|
||||
let yy = y * y;
|
||||
let sum = xx + yy;
|
||||
let mask = sum.lt(f32x4::splat(4.0));
|
||||
|
||||
if !mask.any() { break }
|
||||
count = count + mask.to_i().select(u32x4::splat(1),
|
||||
u32x4::splat(0));
|
||||
|
||||
x = xx - yy + c_x;
|
||||
y = xy + xy + c_y;
|
||||
}
|
||||
count
|
||||
}

const COLOURS: &'static [(f32, f32, f32)] = &[(0.0, 7.0, 100.0),
                                              (32.0, 107.0, 203.0),
                                              (237.0, 255.0, 255.0),
                                              (255.0, 170.0, 0.0),
                                              (0.0, 2.0, 0.0)];
const SCALE: f32 = 12.0;
const LIMIT: u32 = 100;

#[inline(never)]
fn output_one(buf: &mut [u8], val: u32) {
    let (r, g, b);
    if val == LIMIT {
        r = 0;
        g = 0;
        b = 0;
    } else {
        let val = (val as f32 % SCALE) * (COLOURS.len() as f32) / SCALE;
        let left = val as usize % COLOURS.len();
        let right = (left + 1) % COLOURS.len();

        let p = val - left as f32;
        let (r1, g1, b1) = COLOURS[left];
        let (r2, g2, b2) = COLOURS[right];
        r = (r1 + (r2 - r1) * p) as u8;
        g = (g1 + (g2 - g1) * p) as u8;
        b = (b1 + (b2 - b1) * p) as u8;
    }
    buf[0] = r;
    buf[1] = g;
    buf[2] = b;
}

fn main() {
    let mut args = std::env::args();
    args.next();
    let width = args.next().unwrap().parse().unwrap();
    let height = args.next().unwrap().parse().unwrap();

    let left = -2.2;
    let right = left + 3.0;
    let top = 1.0;
    let bottom = top - 2.0;

    let width_step: f32 = (right - left) / width as f32;
    let height_step: f32 = (bottom - top) / height as f32;

    let adjust = f32x4::splat(width_step) * f32x4::new(0., 1., 2., 3.);

    println!("P6 {} {} 255", width, height);
    let mut line = vec![0; width * 3];

    if args.next().is_none() {
        for i in 0..height {
            let y = f32x4::splat(top + height_step * i as f32);
            for j in (0..width).step_by(4) {
                let x = f32x4::splat(left + width_step * j as f32) + adjust;
                let ret = mandelbrot_vector(x, y, LIMIT);
                test::black_box(ret);
                for k in 0..4 {
                    let val = ret.extract(k as u32);
                    output_one(&mut line[3*(j + k)..3*(j + k + 1)], val);
                }
            }
            // write_all avoids silently dropping bytes on a short write
            ::std::io::stdout().write_all(&line).unwrap();
        }
    } else {
        for i in 0..height {
            let y = top + height_step * i as f32;
            for j in 0..width {
                let x = left + width_step * j as f32;
                let val = mandelbrot_naive(x, y, LIMIT);
                test::black_box(val);
                output_one(&mut line[3*j..3*(j + 1)], val);
            }
            ::std::io::stdout().write_all(&line).unwrap();
        }
    }
}
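
// Usage note (ours): the program writes a binary PPM image to stdout,
// e.g. `mandelbrot 800 600 > out.ppm`.  Passing any extra third argument
// selects the scalar `mandelbrot_naive` path instead of the
// four-pixels-per-iteration SIMD path.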
@@ -0,0 +1,280 @@
extern crate simd;
use simd::f32x4;

fn mul(x: &[f32x4; 4], y: &[f32x4; 4]) -> [f32x4; 4] {
    let y0 = y[0];
    let y1 = y[1];
    let y2 = y[2];
    let y3 = y[3];
    [f32x4::splat(y0.extract(0)) * x[0] +
     f32x4::splat(y0.extract(1)) * x[1] +
     f32x4::splat(y0.extract(2)) * x[2] +
     f32x4::splat(y0.extract(3)) * x[3],
     f32x4::splat(y1.extract(0)) * x[0] +
     f32x4::splat(y1.extract(1)) * x[1] +
     f32x4::splat(y1.extract(2)) * x[2] +
     f32x4::splat(y1.extract(3)) * x[3],
     f32x4::splat(y2.extract(0)) * x[0] +
     f32x4::splat(y2.extract(1)) * x[1] +
     f32x4::splat(y2.extract(2)) * x[2] +
     f32x4::splat(y2.extract(3)) * x[3],
     f32x4::splat(y3.extract(0)) * x[0] +
     f32x4::splat(y3.extract(1)) * x[1] +
     f32x4::splat(y3.extract(2)) * x[2] +
     f32x4::splat(y3.extract(3)) * x[3],
    ]
}

fn inverse_naive(x: &[[f32; 4]; 4]) -> [[f32; 4]; 4] {
    let mut t = [[0_f32; 4]; 4];
    for i in 0..4 {
        t[0][i] = x[i][0];
        t[1][i] = x[i][1];
        t[2][i] = x[i][2];
        t[3][i] = x[i][3];
    }
    println!("{:?}", t);

    let _0 = t[2][2] * t[3][3];
    let _1 = t[2][3] * t[3][2];
    let _2 = t[2][1] * t[3][3];
    let _3 = t[2][3] * t[3][1];
    let _4 = t[2][1] * t[3][2];
    let _5 = t[2][2] * t[3][1];
    let _6 = t[2][0] * t[3][3];
    let _7 = t[2][3] * t[3][0];
    let _8 = t[2][0] * t[3][2];
    let _9 = t[2][2] * t[3][0];
    let _10 = t[2][0] * t[3][1];
    let _11 = t[2][1] * t[3][0];
    let v = [_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11];
    println!("{:?}", v);

    let d00 = _0 * t[1][1] + _3 * t[1][2] + _4 * t[1][3] -
        (_1 * t[1][1] + _2 * t[1][2] + _5 * t[1][3]);
    let d01 = _1 * t[1][0] + _6 * t[1][2] + _9 * t[1][3] -
        (_0 * t[1][0] + _7 * t[1][2] + _8 * t[1][3]);
    let d02 = _2 * t[1][0] + _7 * t[1][1] + _10 * t[1][3] -
        (_3 * t[1][0] + _6 * t[1][1] + _11 * t[1][3]);
    let d03 = _5 * t[1][0] + _8 * t[1][1] + _11 * t[1][2] -
        (_4 * t[1][0] + _9 * t[1][1] + _10 * t[1][2]);
    let d10 = _1 * t[0][1] + _2 * t[0][2] + _5 * t[0][3] -
        (_0 * t[0][1] + _3 * t[0][2] + _4 * t[0][3]);
    let d11 = _0 * t[0][0] + _7 * t[0][2] + _8 * t[0][3] -
        (_1 * t[0][0] + _6 * t[0][2] + _9 * t[0][3]);
    let d12 = _3 * t[0][0] + _6 * t[0][1] + _11 * t[0][3] -
        (_2 * t[0][0] + _7 * t[0][1] + _10 * t[0][3]);
    let d13 = _4 * t[0][0] + _9 * t[0][1] + _10 * t[0][2] -
        (_5 * t[0][0] + _8 * t[0][1] + _11 * t[0][2]);

    println!("{:?}", [d00, d01, d02, d03, d10, d11, d12, d13]);

    let _0 = t[0][2] * t[1][3];
    let _1 = t[0][3] * t[1][2];
    let _2 = t[0][1] * t[1][3];
    let _3 = t[0][3] * t[1][1];
    let _4 = t[0][1] * t[1][2];
    let _5 = t[0][2] * t[1][1];
    let _6 = t[0][0] * t[1][3];
    let _7 = t[0][3] * t[1][0];
    let _8 = t[0][0] * t[1][2];
    let _9 = t[0][2] * t[1][0];
    let _10 = t[0][0] * t[1][1];
    let _11 = t[0][1] * t[1][0];
    let v = [_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11];
    println!("{:?}", v);

    let d20 = _0 * t[3][1] + _3 * t[3][2] + _4 * t[3][3] -
        (_1 * t[3][1] + _2 * t[3][2] + _5 * t[3][3]);
    let d21 = _1 * t[3][0] + _6 * t[3][2] + _9 * t[3][3] -
        (_0 * t[3][0] + _7 * t[3][2] + _8 * t[3][3]);
    let d22 = _2 * t[3][0] + _7 * t[3][1] + _10 * t[3][3] -
        (_3 * t[3][0] + _6 * t[3][1] + _11 * t[3][3]);
    let d23 = _5 * t[3][0] + _8 * t[3][1] + _11 * t[3][2] -
        (_4 * t[3][0] + _9 * t[3][1] + _10 * t[3][2]);
    let d30 = _2 * t[2][2] + _5 * t[2][3] + _1 * t[2][1] -
        (_4 * t[2][3] + _0 * t[2][1] + _3 * t[2][2]);
    let d31 = _8 * t[2][3] + _0 * t[2][0] + _7 * t[2][2] -
        (_6 * t[2][2] + _9 * t[2][3] + _1 * t[2][0]);
    let d32 = _6 * t[2][1] + _11 * t[2][3] + _3 * t[2][0] -
        (_10 * t[2][3] + _2 * t[2][0] + _7 * t[2][1]);
    let d33 = _10 * t[2][2] + _4 * t[2][0] + _9 * t[2][1] -
        (_8 * t[2][1] + _11 * t[2][2] + _5 * t[2][0]);

    println!("{:?}", [d20, d21, d22, d23, d30, d31, d32, d33]);

    let det = t[0][0] * d00 + t[0][1] * d01 + t[0][2] * d02 + t[0][3] * d03;

    let det = 1.0 / det;
    let mut ret = [[d00, d01, d02, d03],
                   [d10, d11, d12, d13],
                   [d20, d21, d22, d23],
                   [d30, d31, d32, d33]];
    for i in 0..4 {
        for j in 0..4 {
            ret[i][j] *= det;
        }
    }
    ret
}
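
// Note (ours): `inverse_naive` is a straight Cramer's-rule expansion:
// transpose, form the twelve 2x2 sub-determinants, combine them into
// cofactors d00..d33, then scale by 1/det.  `inverse_simd4` below appears
// to follow the well-known SSE 4x4-inverse sequence (shuffle-heavy
// cofactor computation plus a Newton-Raphson-refined reciprocal),
// expressed with portable `extract`/`new` shuffles; the odd-looking
// `extract(5 - 4)` style indices mirror the 0..7 lane numbering of the
// original two-register shuffles.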

fn inverse_simd4(x: &[f32x4; 4]) -> [f32x4; 4] {
    let src0 = x[0];
    let src1 = x[1];
    let src2 = x[2];
    let src3 = x[3];

    let tmp1 = f32x4::new(src0.extract(0), src0.extract(1),
                          src1.extract(4 - 4), src1.extract(5 - 4));
    let row1 = f32x4::new(src2.extract(0), src2.extract(1),
                          src3.extract(4 - 4), src3.extract(5 - 4));
    let row0 = f32x4::new(tmp1.extract(0), tmp1.extract(2),
                          row1.extract(4 - 4), row1.extract(6 - 4));
    let row1 = f32x4::new(row1.extract(1), row1.extract(3),
                          tmp1.extract(5 - 4), tmp1.extract(7 - 4));

    let tmp1 = f32x4::new(src0.extract(2), src0.extract(3),
                          src1.extract(6 - 4), src1.extract(7 - 4));
    let row3 = f32x4::new(src2.extract(2), src2.extract(3),
                          src3.extract(6 - 4), src3.extract(7 - 4));
    let row2 = f32x4::new(tmp1.extract(0), tmp1.extract(2),
                          row3.extract(4 - 4), row3.extract(6 - 4));
    let row3 = f32x4::new(row3.extract(1), row3.extract(3),
                          tmp1.extract(5 - 4), tmp1.extract(7 - 4));

    let tmp1 = row2 * row3;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor0 = row1 * tmp1;
    let minor1 = row0 * tmp1;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor0 = (row1 * tmp1) - minor0;
    let minor1 = (row0 * tmp1) - minor1;
    let minor1 = f32x4::new(minor1.extract(2), minor1.extract(3),
                            minor1.extract(0), minor1.extract(1));
    //println!("{:?}", minor1);

    let tmp1 = row1 * row2;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor0 = (row3 * tmp1) + minor0;
    let minor3 = row0 * tmp1;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));

    let minor0 = minor0 - row3 * tmp1;
    let minor3 = row0 * tmp1 - minor3;
    let minor3 = f32x4::new(minor3.extract(2), minor3.extract(3),
                            minor3.extract(0), minor3.extract(1));
    //println!("{:?}", minor1);

    let tmp1 = row3 * f32x4::new(row1.extract(2), row1.extract(3),
                                 row1.extract(0), row1.extract(1));
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let row2 = f32x4::new(row2.extract(2), row2.extract(3),
                          row2.extract(0), row2.extract(1));
    let minor0 = row2 * tmp1 + minor0;
    let minor2 = row0 * tmp1;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor0 = minor0 - row2 * tmp1;
    let minor2 = row0 * tmp1 - minor2;
    let minor2 = f32x4::new(minor2.extract(2), minor2.extract(3),
                            minor2.extract(0), minor2.extract(1));
    //println!("{:?}", minor1);

    let tmp1 = row0 * row1;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor2 = minor2 + row3 * tmp1;
    let minor3 = row2 * tmp1 - minor3;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor2 = row3 * tmp1 - minor2;
    let minor3 = minor3 - row2 * tmp1;
    //println!("{:?}", minor1);

    let tmp1 = row0 * row3;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor1 = minor1 - row2 * tmp1;
    let minor2 = row1 * tmp1 + minor2;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor1 = row2 * tmp1 + minor1;
    let minor2 = minor2 - row1 * tmp1;
    //println!("{:?}", minor1);

    let tmp1 = row0 * row2;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor1 = row3 * tmp1 + minor1;
    let minor3 = minor3 - row1 * tmp1;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor1 = minor1 - row3 * tmp1;
    let minor3 = row1 * tmp1 + minor3;
    //println!("{:?}", minor1);

    let det = row0 * minor0;
    let det = f32x4::new(det.extract(2), det.extract(3),
                         det.extract(0), det.extract(1)) + det;
    let det = f32x4::new(det.extract(1), det.extract(0),
                         det.extract(3), det.extract(2)) + det;
    let tmp1 = det.approx_reciprocal();
    let det = tmp1 + tmp1 - det * tmp1 * tmp1;

    // let det = f32x4::splat(det.extract(0));

    [minor0 * det, minor1 * det, minor2 * det, minor3 * det]
}

fn p(x: &[f32x4; 4]) {
    for xx in x {
        for i in 0..4 {
            let v = xx.extract(i);
            if v == 0.0 {
                print!("{}{:6.2}", if i > 0 {", "} else {"|"}, "");
            } else {
                print!("{}{:6.2}", if i > 0 {", "} else {"|"}, xx.extract(i));
            }
        }
        println!(" |");
    }
}

fn main() {
    let x = [f32x4::new(-100.0, 6.0, 100.0, 1.0),
             f32x4::new(3.0, 1.0, 0.0, 1.0),
             f32x4::new(2.0, 1.0, 1.0, 1.0),
             f32x4::new(-10.0, 1.0, 1.0, 1.0)];

    /* let mut x_ = [[0.0; 4]; 4];
    for i in 0..4 {
        for j in 0..4 {
            x_[i][j] = x[i].extract(j as u32)
        }
    }

    let ret = inverse_naive(&x_);
    let mut y = [f32x4::splat(0.0); 4];
    for i in 0..4 {
        for j in 0..4 {
            y[i] = y[i].replace(j as u32, ret[i][j])
        }
    }*/
    let y = inverse_simd4(&x);
    p(&x);
    println!("");
    p(&y);
    println!("");
    p(&mul(&x, &y))
}
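
// Illustrative check (ours): `mul(&x, &y)` should be close to the
// identity when `y` is a good inverse of `x`.  `approx_reciprocal` makes
// the result inexact, so compare against a loose epsilon rather than
// exactly.
#[allow(dead_code)]
fn is_identity(m: &[f32x4; 4], eps: f32) -> bool {
    for i in 0..4 {
        for j in 0..4u32 {
            let expected = if i as u32 == j { 1.0 } else { 0.0 };
            if (m[i].extract(j) - expected).abs() >= eps { return false; }
        }
    }
    true
}
// e.g.: assert!(is_identity(&mul(&x, &y), 1e-2));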
@@ -0,0 +1,156 @@
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// contributed by the Rust Project Developers
// contributed by TeXitoi

const PI: f64 = 3.141592653589793;
const SOLAR_MASS: f64 = 4.0 * PI * PI;
const YEAR: f64 = 365.24;
const N_BODIES: usize = 5;

static BODIES: [Planet; N_BODIES] = [
    // Sun
    Planet {
        x: 0.0, y: 0.0, z: 0.0,
        vx: 0.0, vy: 0.0, vz: 0.0,
        mass: SOLAR_MASS,
    },
    // Jupiter
    Planet {
        x: 4.84143144246472090e+00,
        y: -1.16032004402742839e+00,
        z: -1.03622044471123109e-01,
        vx: 1.66007664274403694e-03 * YEAR,
        vy: 7.69901118419740425e-03 * YEAR,
        vz: -6.90460016972063023e-05 * YEAR,
        mass: 9.54791938424326609e-04 * SOLAR_MASS,
    },
    // Saturn
    Planet {
        x: 8.34336671824457987e+00,
        y: 4.12479856412430479e+00,
        z: -4.03523417114321381e-01,
        vx: -2.76742510726862411e-03 * YEAR,
        vy: 4.99852801234917238e-03 * YEAR,
        vz: 2.30417297573763929e-05 * YEAR,
        mass: 2.85885980666130812e-04 * SOLAR_MASS,
    },
    // Uranus
    Planet {
        x: 1.28943695621391310e+01,
        y: -1.51111514016986312e+01,
        z: -2.23307578892655734e-01,
        vx: 2.96460137564761618e-03 * YEAR,
        vy: 2.37847173959480950e-03 * YEAR,
        vz: -2.96589568540237556e-05 * YEAR,
        mass: 4.36624404335156298e-05 * SOLAR_MASS,
    },
    // Neptune
    Planet {
        x: 1.53796971148509165e+01,
        y: -2.59193146099879641e+01,
        z: 1.79258772950371181e-01,
        vx: 2.68067772490389322e-03 * YEAR,
        vy: 1.62824170038242295e-03 * YEAR,
        vz: -9.51592254519715870e-05 * YEAR,
        mass: 5.15138902046611451e-05 * SOLAR_MASS,
    },
];

#[derive(Clone, Copy)]
struct Planet {
    x: f64, y: f64, z: f64,
    vx: f64, vy: f64, vz: f64,
    mass: f64,
}

fn advance(bodies: &mut [Planet; N_BODIES], dt: f64, steps: i32) {
    for _ in 0..steps {
        let mut b_slice: &mut [_] = bodies;
        loop {
            let bi = match shift_mut_ref(&mut b_slice) {
                Some(bi) => bi,
                None => break
            };
            for bj in b_slice.iter_mut() {
                let dx = bi.x - bj.x;
                let dy = bi.y - bj.y;
                let dz = bi.z - bj.z;

                let d2 = dx * dx + dy * dy + dz * dz;
                let mag = dt / (d2 * d2.sqrt());

                let massj_mag = bj.mass * mag;
                bi.vx -= dx * massj_mag;
                bi.vy -= dy * massj_mag;
                bi.vz -= dz * massj_mag;

                let massi_mag = bi.mass * mag;
                bj.vx += dx * massi_mag;
                bj.vy += dy * massi_mag;
                bj.vz += dz * massi_mag;
            }
            bi.x += dt * bi.vx;
            bi.y += dt * bi.vy;
            bi.z += dt * bi.vz;
        }
    }
}

fn energy(bodies: &[Planet; N_BODIES]) -> f64 {
    let mut e = 0.0;
    let mut bodies = bodies.iter();
    loop {
        let bi = match bodies.next() {
            Some(bi) => bi,
            None => break
        };
        e += (bi.vx * bi.vx + bi.vy * bi.vy + bi.vz * bi.vz) * bi.mass / 2.0;
        for bj in bodies.clone() {
            let dx = bi.x - bj.x;
            let dy = bi.y - bj.y;
            let dz = bi.z - bj.z;
            let dist = (dx * dx + dy * dy + dz * dz).sqrt();
            e -= bi.mass * bj.mass / dist;
        }
    }
    e
}

fn offset_momentum(bodies: &mut [Planet; N_BODIES]) {
    let mut px = 0.0;
    let mut py = 0.0;
    let mut pz = 0.0;
    for bi in bodies.iter() {
        px += bi.vx * bi.mass;
        py += bi.vy * bi.mass;
        pz += bi.vz * bi.mass;
    }
    let sun = &mut bodies[0];
    sun.vx = - px / SOLAR_MASS;
    sun.vy = - py / SOLAR_MASS;
    sun.vz = - pz / SOLAR_MASS;
}

fn main() {
    let n = std::env::args().nth(1).expect("need one arg").parse().unwrap();
    let mut bodies = BODIES;

    offset_momentum(&mut bodies);
    println!("{:.9}", energy(&bodies));

    advance(&mut bodies, 0.01, n);

    println!("{:.9}", energy(&bodies));
}

/// Pop a mutable reference off the head of a slice, mutating the slice to no
/// longer contain the mutable reference.
fn shift_mut_ref<'a, T>(r: &mut &'a mut [T]) -> Option<&'a mut T> {
    if r.len() == 0 { return None }
    let tmp = std::mem::replace(r, &mut []);
    let (h, t) = tmp.split_at_mut(1);
    *r = t;
    Some(&mut h[0])
}
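
// Minimal sketch (ours) of the iteration pattern `advance` builds on top
// of `shift_mut_ref`: peeling the head off the slice yields every
// unordered pair exactly once while keeping `&mut` access to both
// elements, with no unsafe code and no index arithmetic.
#[allow(dead_code)]
fn for_each_pair<T, F: FnMut(&mut T, &mut T)>(mut s: &mut [T], mut f: F) {
    while let Some(head) = shift_mut_ref(&mut s) {
        for other in s.iter_mut() {
            f(head, other);
        }
    }
}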
@@ -0,0 +1,170 @@
#![feature(cfg_target_feature)]

extern crate simd;

#[cfg(target_feature = "sse2")]
use simd::x86::sse2::*;
#[cfg(target_arch = "aarch64")]
use simd::aarch64::neon::*;

const PI: f64 = 3.141592653589793;
const SOLAR_MASS: f64 = 4.0 * PI * PI;
const DAYS_PER_YEAR: f64 = 365.24;

struct Body {
    x: [f64; 3],
    _fill: f64,
    v: [f64; 3],
    mass: f64,
}

impl Body {
    fn new(x0: f64, x1: f64, x2: f64,
           v0: f64, v1: f64, v2: f64,
           mass: f64) -> Body {
        Body {
            x: [x0, x1, x2],
            _fill: 0.0,
            v: [v0, v1, v2],
            mass: mass,
        }
    }
}

const N_BODIES: usize = 5;
const N: usize = N_BODIES * (N_BODIES - 1) / 2;
fn offset_momentum(bodies: &mut [Body; N_BODIES]) {
    let (sun, rest) = bodies.split_at_mut(1);
    let sun = &mut sun[0];
    for body in rest {
        for k in 0..3 {
            sun.v[k] -= body.v[k] * body.mass / SOLAR_MASS;
        }
    }
}
fn advance(bodies: &mut [Body; N_BODIES], dt: f64) {
    let mut r = [[0.0; 4]; N];
    let mut mag = [0.0; N];

    let mut dx = [f64x2::splat(0.0); 3];
    let mut dsquared;
    let mut distance;
    let mut dmag;

    let mut i = 0;
    for j in 0..N_BODIES {
        for k in j+1..N_BODIES {
            for m in 0..3 {
                r[i][m] = bodies[j].x[m] - bodies[k].x[m];
            }
            i += 1;
        }
    }

    i = 0;
    while i < N {
        for m in 0..3 {
            dx[m] = f64x2::new(r[i][m], r[i+1][m]);
        }

        dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
        distance = dsquared.to_f32().approx_rsqrt().to_f64();
        for _ in 0..2 {
            distance = distance * f64x2::splat(1.5) -
                ((f64x2::splat(0.5) * dsquared) * distance) * (distance * distance)
        }
        dmag = f64x2::splat(dt) / dsquared * distance;
        dmag.store(&mut mag, i);

        i += 2;
    }

    i = 0;
    for j in 0..N_BODIES {
        for k in j+1..N_BODIES {
            for m in 0..3 {
                bodies[j].v[m] -= r[i][m] * bodies[k].mass * mag[i];
                bodies[k].v[m] += r[i][m] * bodies[j].mass * mag[i];
            }
            i += 1
        }
    }
    for body in bodies {
        for m in 0..3 {
            body.x[m] += dt * body.v[m]
        }
    }
}
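
// Scalar model of the reciprocal-square-root refinement in `advance`
// above (our illustration, not part of the benchmark): starting from a
// rough estimate d0 ~ 1/sqrt(s), each Newton-Raphson step
//     d' = 1.5 * d - (0.5 * s * d) * d * d
// roughly doubles the number of correct bits, so two steps turn the
// f32-precision `approx_rsqrt` seed into a near-f64-accurate value.
#[allow(dead_code)]
fn refined_rsqrt(s: f64, mut d: f64) -> f64 {
    for _ in 0..2 {
        d = d * 1.5 - ((0.5 * s) * d) * (d * d);
    }
    d
}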

fn energy(bodies: &[Body; N_BODIES]) -> f64 {
    let mut e = 0.0;
    for i in 0..N_BODIES {
        let bi = &bodies[i];
        e += bi.mass * (bi.v[0] * bi.v[0] + bi.v[1] * bi.v[1] + bi.v[2] * bi.v[2]) / 2.0;
        for j in i+1..N_BODIES {
            let bj = &bodies[j];
            let mut dx = [0.0; 3];
            for k in 0..3 {
                dx[k] = bi.x[k] - bj.x[k];
            }
            let mut distance = 0.0;
            for &d in &dx { distance += d * d }
            e -= bi.mass * bj.mass / distance.sqrt()
        }
    }
    e
}

fn main() {
    let mut bodies: [Body; N_BODIES] = [
        /* sun */
        Body::new(0.0, 0.0, 0.0,
                  0.0, 0.0, 0.0,
                  SOLAR_MASS),
        /* jupiter */
        Body::new(4.84143144246472090e+00,
                  -1.16032004402742839e+00,
                  -1.03622044471123109e-01,
                  1.66007664274403694e-03 * DAYS_PER_YEAR,
                  7.69901118419740425e-03 * DAYS_PER_YEAR,
                  -6.90460016972063023e-05 * DAYS_PER_YEAR,
                  9.54791938424326609e-04 * SOLAR_MASS),
        /* saturn */
        Body::new(8.34336671824457987e+00,
                  4.12479856412430479e+00,
                  -4.03523417114321381e-01,
                  -2.76742510726862411e-03 * DAYS_PER_YEAR,
                  4.99852801234917238e-03 * DAYS_PER_YEAR,
                  2.30417297573763929e-05 * DAYS_PER_YEAR,
                  2.85885980666130812e-04 * SOLAR_MASS),
        /* uranus */
        Body::new(1.28943695621391310e+01,
                  -1.51111514016986312e+01,
                  -2.23307578892655734e-01,
                  2.96460137564761618e-03 * DAYS_PER_YEAR,
                  2.37847173959480950e-03 * DAYS_PER_YEAR,
                  -2.96589568540237556e-05 * DAYS_PER_YEAR,
                  4.36624404335156298e-05 * SOLAR_MASS),
        /* neptune */
        Body::new(1.53796971148509165e+01,
                  -2.59193146099879641e+01,
                  1.79258772950371181e-01,
                  2.68067772490389322e-03 * DAYS_PER_YEAR,
                  1.62824170038242295e-03 * DAYS_PER_YEAR,
                  -9.51592254519715870e-05 * DAYS_PER_YEAR,
                  5.15138902046611451e-05 * SOLAR_MASS)
    ];

    let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap();

    offset_momentum(&mut bodies);
    println!("{:.9}", energy(&bodies));
    for _ in 0..n {
        advance(&mut bodies, 0.01);
    }
    println!("{:.9}", energy(&bodies));
}
@@ -0,0 +1,9 @@
extern crate simd;

use simd::*;

fn main() {
    let x = i32x4::splat(1_i32);
    let y = -x;
    let z = !x;
}
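
// Note (ours): this example appears to exist purely to check that unary
// negation and bitwise-not compile for the packed integer types; `y` and
// `z` are never read, so rustc will warn about unused variables here.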
@@ -0,0 +1,106 @@
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// contributed by the Rust Project Developers
// contributed by TeXitoi

#![allow(non_snake_case)]

use std::iter::repeat;
//use std::thread;

// As std::simd::f64x2 is unstable, we provide a similar interface,
// expecting llvm to autovectorize its usage.
#[allow(non_camel_case_types)]
struct f64x2(f64, f64);
impl std::ops::Add for f64x2 {
    type Output = Self;
    fn add(self, rhs: Self) -> Self {
        f64x2(self.0 + rhs.0, self.1 + rhs.1)
    }
}
impl std::ops::Div for f64x2 {
    type Output = Self;
    fn div(self, rhs: Self) -> Self {
        f64x2(self.0 / rhs.0, self.1 / rhs.1)
    }
}

fn main() {
    let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap();
    let answer = spectralnorm(n);
    println!("{:.9}", answer);
}

fn spectralnorm(n: usize) -> f64 {
    assert!(n % 2 == 0, "only even lengths are accepted");
    let mut u = repeat(1.0).take(n).collect::<Vec<_>>();
    let mut v = u.clone();
    let mut tmp = v.clone();
    for _ in 0..10 {
        mult_AtAv(&u, &mut v, &mut tmp);
        mult_AtAv(&v, &mut u, &mut tmp);
    }
    (dot(&u, &v) / dot(&v, &v)).sqrt()
}

fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) {
    mult_Av(v, tmp);
    mult_Atv(tmp, out);
}

fn mult_Av(v: &[f64], out: &mut [f64]) {
    parallel(out, |start, out| mult(v, out, start, |i, j| A(i, j)));
}

fn mult_Atv(v: &[f64], out: &mut [f64]) {
    parallel(out, |start, out| mult(v, out, start, |i, j| A(j, i)));
}

fn mult<F>(v: &[f64], out: &mut [f64], start: usize, a: F)
           where F: Fn(usize, usize) -> f64 {
    for (i, slot) in out.iter_mut().enumerate().map(|(i, s)| (i + start, s)) {
        let mut sum = f64x2(0.0, 0.0);
        for (j, chunk) in v.chunks(2).enumerate().map(|(j, s)| (2 * j, s)) {
            let top = f64x2(chunk[0], chunk[1]);
            let bot = f64x2(a(i, j), a(i, j + 1));
            sum = sum + top / bot;
        }
        let f64x2(a, b) = sum;
        *slot = a + b;
    }
}

fn A(i: usize, j: usize) -> f64 {
    ((i + j) * (i + j + 1) / 2 + i + 1) as f64
}
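
// Quick sanity check (ours): `A` returns the *denominator* of the
// spectral-norm matrix entry a(i, j) = 1 / ((i+j)(i+j+1)/2 + i + 1),
// so row 0 of the matrix starts 1, 1/2, 1/4, ... and column 0 starts
// 1, 1/3, 1/6, ...
#[test]
fn A_matches_first_entries() {
    assert_eq!(A(0, 0), 1.0);
    assert_eq!(A(0, 1), 2.0);
    assert_eq!(A(1, 0), 3.0);
    assert_eq!(A(1, 1), 5.0);
}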

fn dot(v: &[f64], u: &[f64]) -> f64 {
    v.iter().zip(u.iter()).map(|(a, b)| *a * *b).fold(0., |acc, i| acc + i)
}

//struct Racy<T>(T);
//unsafe impl<T: 'static> Send for Racy<T> {}

// Executes a closure over the given mutable slice; the closure `f` is
// given the starting index within `v` as well as a sub-slice of `v`.
// (The threaded version below is commented out, so this currently runs
// the closure once, serially, over the whole slice.)
fn parallel<'a, T, F>(v: &mut [T], ref f: F)
    where T: 'static + Send + Sync,
          F: Fn(usize, &mut [T]) + Sync
{
    f(0, v);
    /*let size = v.len() / 4 + 1;
    let jhs = v.chunks_mut(size).enumerate().map(|(i, chunk)| {
        // Need to convert `f` and `chunk` to something that can cross the task
        // boundary.
        let f = Racy(f as *const F as *const usize);
        let raw = Racy((&mut chunk[0] as *mut T, chunk.len()));
        thread::spawn(move|| {
            let f = f.0 as *const F;
            let raw = raw.0;
            unsafe { (*f)(i * size, std::slice::from_raw_parts_mut(raw.0, raw.1)) }
        })
    }).collect::<Vec<_>>();
    for jh in jhs { jh.join().unwrap(); }*/
}
@@ -0,0 +1,74 @@
#![feature(cfg_target_feature)]
#![allow(non_snake_case)]

extern crate simd;

#[cfg(target_feature = "sse2")]
use simd::x86::sse2::f64x2;
#[cfg(target_arch = "aarch64")]
use simd::aarch64::neon::f64x2;

fn A(i: usize, j: usize) -> f64 {
    ((i + j) * (i + j + 1) / 2 + i + 1) as f64
}

fn dot(x: &[f64], y: &[f64]) -> f64 {
    x.iter().zip(y).map(|(&x, &y)| x * y).fold(0.0, |a, b| a + b)
}

fn mult_Av(v: &[f64], out: &mut [f64]) {
    assert!(v.len() == out.len());
    assert!(v.len() % 2 == 0);

    for i in 0..v.len() {
        let mut sum = f64x2::splat(0.0);

        let mut j = 0;
        while j < v.len() {
            let b = f64x2::load(v, j);
            let a = f64x2::new(A(i, j), A(i, j + 1));
            sum = sum + b / a;
            j += 2
        }
        out[i] = sum.extract(0) + sum.extract(1);
    }
}

fn mult_Atv(v: &[f64], out: &mut [f64]) {
    assert!(v.len() == out.len());
    assert!(v.len() % 2 == 0);

    for i in 0..v.len() {
        let mut sum = f64x2::splat(0.0);

        let mut j = 0;
        while j < v.len() {
            let b = f64x2::load(v, j);
            let a = f64x2::new(A(j, i), A(j + 1, i));
            sum = sum + b / a;
            j += 2
        }
        out[i] = sum.extract(0) + sum.extract(1);
    }
}

fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) {
    mult_Av(v, tmp);
    mult_Atv(tmp, out);
}

fn main() {
    let mut n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap();
    // Round n up to even so the two-lane f64x2 loads above never read
    // past the end of the vectors.
    if n % 2 == 1 { n += 1 }

    let mut u = vec![1.0; n];
    let mut v = u.clone();
    let mut tmp = u.clone();

    for _ in 0..10 {
        mult_AtAv(&u, &mut v, &mut tmp);
        mult_AtAv(&v, &mut u, &mut tmp);
    }

    println!("{:.9}", (dot(&u, &v) / dot(&v, &v)).sqrt());
}
@@ -0,0 +1,3 @@
//! Features specific to AArch64 CPUs.

pub mod neon;
@@ -0,0 +1,681 @@
use super::super::*;
|
||||
use {simd_cast, f32x2};
|
||||
|
||||
pub use sixty_four::{f64x2, i64x2, u64x2, bool64ix2, bool64fx2};
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct u32x2(u32, u32);
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct i32x2(i32, i32);
|
||||
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct u16x4(u16, u16, u16, u16);
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct i16x4(i16, i16, i16, i16);
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct u8x8(u8, u8, u8, u8,
|
||||
u8, u8, u8, u8);
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct i8x8(i8, i8, i8, i8,
|
||||
i8, i8, i8, i8);
|
||||
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct i64x1(i64);
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct u64x1(u64);
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct f64x1(f64);
|
||||
|
||||
#[allow(dead_code)]
|
||||
extern "platform-intrinsic" {
|
||||
fn aarch64_vhadd_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vhadd_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vhadd_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vhadd_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vhadd_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vhadd_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vhaddq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vhaddq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vhaddq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vhaddq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vhaddq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vhaddq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vrhadd_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vrhadd_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vrhadd_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vrhadd_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vrhadd_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vrhadd_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vrhaddq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vrhaddq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vrhaddq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vrhaddq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vrhaddq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vrhaddq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vqadd_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vqadd_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vqadd_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vqadd_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vqadd_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vqadd_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vqadd_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vqadd_u64(x: u64x1, y: u64x1) -> u64x1;
|
||||
fn aarch64_vqaddq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vqaddq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vqaddq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vqaddq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vqaddq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vqaddq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vqaddq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vqaddq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vuqadd_s8(x: i8x16, y: u8x16) -> i8x16;
|
||||
fn aarch64_vuqadd_s16(x: i16x8, y: u16x8) -> i16x8;
|
||||
fn aarch64_vuqadd_s32(x: i32x4, y: u32x4) -> i32x4;
|
||||
fn aarch64_vuqadd_s64(x: i64x2, y: u64x2) -> i64x2;
|
||||
fn aarch64_vsqadd_u8(x: u8x16, y: i8x16) -> u8x16;
|
||||
fn aarch64_vsqadd_u16(x: u16x8, y: i16x8) -> u16x8;
|
||||
fn aarch64_vsqadd_u32(x: u32x4, y: i32x4) -> u32x4;
|
||||
fn aarch64_vsqadd_u64(x: u64x2, y: i64x2) -> u64x2;
|
||||
fn aarch64_vraddhn_s16(x: i16x8, y: i16x8) -> i8x8;
|
||||
fn aarch64_vraddhn_u16(x: u16x8, y: u16x8) -> u8x8;
|
||||
fn aarch64_vraddhn_s32(x: i32x4, y: i32x4) -> i16x4;
|
||||
fn aarch64_vraddhn_u32(x: u32x4, y: u32x4) -> u16x4;
|
||||
fn aarch64_vraddhn_s64(x: i64x2, y: i64x2) -> i32x2;
|
||||
fn aarch64_vraddhn_u64(x: u64x2, y: u64x2) -> u32x2;
|
||||
fn aarch64_vfmulx_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vfmulx_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vfmulxq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vfmulxq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vfma_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vfma_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vfmaq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vfmaq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vqdmulh_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vqdmulh_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vqdmulhq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vqdmulhq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vqrdmulh_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vqrdmulh_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vqrdmulhq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vqrdmulhq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vmull_s8(x: i8x8, y: i8x8) -> i16x8;
|
||||
fn aarch64_vmull_u8(x: u8x8, y: u8x8) -> u16x8;
|
||||
fn aarch64_vmull_s16(x: i16x4, y: i16x4) -> i32x4;
|
||||
fn aarch64_vmull_u16(x: u16x4, y: u16x4) -> u32x4;
|
||||
fn aarch64_vmull_s32(x: i32x2, y: i32x2) -> i64x2;
|
||||
fn aarch64_vmull_u32(x: u32x2, y: u32x2) -> u64x2;
|
||||
fn aarch64_vqdmullq_s8(x: i8x8, y: i8x8) -> i16x8;
|
||||
fn aarch64_vqdmullq_s16(x: i16x4, y: i16x4) -> i32x4;
|
||||
fn aarch64_vhsub_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vhsub_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vhsub_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vhsub_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vhsub_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vhsub_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vhsubq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vhsubq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vhsubq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vhsubq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vhsubq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vhsubq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vqsub_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vqsub_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vqsub_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vqsub_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vqsub_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vqsub_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vqsub_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vqsub_u64(x: u64x1, y: u64x1) -> u64x1;
|
||||
fn aarch64_vqsubq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vqsubq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vqsubq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vqsubq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vqsubq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vqsubq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vqsubq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vqsubq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vrsubhn_s16(x: i16x8, y: i16x8) -> i8x8;
|
||||
fn aarch64_vrsubhn_u16(x: u16x8, y: u16x8) -> u8x8;
|
||||
fn aarch64_vrsubhn_s32(x: i32x4, y: i32x4) -> i16x4;
|
||||
fn aarch64_vrsubhn_u32(x: u32x4, y: u32x4) -> u16x4;
|
||||
fn aarch64_vrsubhn_s64(x: i64x2, y: i64x2) -> i32x2;
|
||||
fn aarch64_vrsubhn_u64(x: u64x2, y: u64x2) -> u32x2;
|
||||
fn aarch64_vabd_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vabd_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vabd_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vabd_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vabd_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vabd_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vabd_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vabd_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vabdq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vabdq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vabdq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vabdq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vabdq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vabdq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vabdq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vabdq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vmax_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vmax_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vmax_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vmax_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vmax_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vmax_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vmax_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vmax_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vmaxq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vmaxq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vmaxq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vmaxq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vmaxq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vmaxq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vmaxq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vmaxq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vmin_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vmin_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vmin_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vmin_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vmin_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vmin_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vmin_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vmin_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vminq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vminq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vminq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vminq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vminq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vminq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vminq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vminq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vmaxnm_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vmaxnm_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vmaxnmq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vmaxnmq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vminnm_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vminnm_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vminnmq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vminnmq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vshl_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vshl_u8(x: u8x8, y: i8x8) -> u8x8;
|
||||
fn aarch64_vshl_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vshl_u16(x: u16x4, y: i16x4) -> u16x4;
|
||||
fn aarch64_vshl_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vshl_u32(x: u32x2, y: i32x2) -> u32x2;
|
||||
fn aarch64_vshl_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vshl_u64(x: u64x1, y: i64x1) -> u64x1;
|
||||
fn aarch64_vshlq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vshlq_u8(x: u8x16, y: i8x16) -> u8x16;
|
||||
fn aarch64_vshlq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vshlq_u16(x: u16x8, y: i16x8) -> u16x8;
|
||||
fn aarch64_vshlq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vshlq_u32(x: u32x4, y: i32x4) -> u32x4;
|
||||
fn aarch64_vshlq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vshlq_u64(x: u64x2, y: i64x2) -> u64x2;
|
||||
fn aarch64_vqshl_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vqshl_u8(x: u8x8, y: i8x8) -> u8x8;
|
||||
fn aarch64_vqshl_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vqshl_u16(x: u16x4, y: i16x4) -> u16x4;
|
||||
fn aarch64_vqshl_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vqshl_u32(x: u32x2, y: i32x2) -> u32x2;
|
||||
fn aarch64_vqshl_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vqshl_u64(x: u64x1, y: i64x1) -> u64x1;
|
||||
fn aarch64_vqshlq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vqshlq_u8(x: u8x16, y: i8x16) -> u8x16;
|
||||
fn aarch64_vqshlq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vqshlq_u16(x: u16x8, y: i16x8) -> u16x8;
|
||||
fn aarch64_vqshlq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vqshlq_u32(x: u32x4, y: i32x4) -> u32x4;
|
||||
fn aarch64_vqshlq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vqshlq_u64(x: u64x2, y: i64x2) -> u64x2;
|
||||
fn aarch64_vrshl_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vrshl_u8(x: u8x8, y: i8x8) -> u8x8;
|
||||
fn aarch64_vrshl_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vrshl_u16(x: u16x4, y: i16x4) -> u16x4;
|
||||
fn aarch64_vrshl_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vrshl_u32(x: u32x2, y: i32x2) -> u32x2;
|
||||
fn aarch64_vrshl_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vrshl_u64(x: u64x1, y: i64x1) -> u64x1;
|
||||
fn aarch64_vrshlq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vrshlq_u8(x: u8x16, y: i8x16) -> u8x16;
|
||||
fn aarch64_vrshlq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vrshlq_u16(x: u16x8, y: i16x8) -> u16x8;
|
||||
fn aarch64_vrshlq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vrshlq_u32(x: u32x4, y: i32x4) -> u32x4;
|
||||
fn aarch64_vrshlq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vrshlq_u64(x: u64x2, y: i64x2) -> u64x2;
|
||||
fn aarch64_vqrshl_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vqrshl_u8(x: u8x8, y: i8x8) -> u8x8;
|
||||
fn aarch64_vqrshl_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vqrshl_u16(x: u16x4, y: i16x4) -> u16x4;
|
||||
fn aarch64_vqrshl_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vqrshl_u32(x: u32x2, y: i32x2) -> u32x2;
|
||||
fn aarch64_vqrshl_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vqrshl_u64(x: u64x1, y: i64x1) -> u64x1;
|
||||
fn aarch64_vqrshlq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vqrshlq_u8(x: u8x16, y: i8x16) -> u8x16;
|
||||
fn aarch64_vqrshlq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vqrshlq_u16(x: u16x8, y: i16x8) -> u16x8;
|
||||
fn aarch64_vqrshlq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vqrshlq_u32(x: u32x4, y: i32x4) -> u32x4;
|
||||
fn aarch64_vqrshlq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vqrshlq_u64(x: u64x2, y: i64x2) -> u64x2;
|
||||
fn aarch64_vqshrun_n_s16(x: i16x8, y: u32) -> i8x8;
|
||||
fn aarch64_vqshrun_n_s32(x: i32x4, y: u32) -> i16x4;
|
||||
fn aarch64_vqshrun_n_s64(x: i64x2, y: u32) -> i32x2;
|
||||
fn aarch64_vqrshrun_n_s16(x: i16x8, y: u32) -> i8x8;
|
||||
fn aarch64_vqrshrun_n_s32(x: i32x4, y: u32) -> i16x4;
|
||||
fn aarch64_vqrshrun_n_s64(x: i64x2, y: u32) -> i32x2;
|
||||
fn aarch64_vqshrn_n_s16(x: i16x8, y: u32) -> i8x8;
|
||||
fn aarch64_vqshrn_n_u16(x: u16x8, y: u32) -> u8x8;
|
||||
fn aarch64_vqshrn_n_s32(x: i32x4, y: u32) -> i16x4;
|
||||
fn aarch64_vqshrn_n_u32(x: u32x4, y: u32) -> u16x4;
|
||||
fn aarch64_vqshrn_n_s64(x: i64x2, y: u32) -> i32x2;
|
||||
fn aarch64_vqshrn_n_u64(x: u64x2, y: u32) -> u32x2;
|
||||
fn aarch64_vrshrn_n_s16(x: i16x8, y: u32) -> i8x8;
|
||||
fn aarch64_vrshrn_n_u16(x: u16x8, y: u32) -> u8x8;
|
||||
fn aarch64_vrshrn_n_s32(x: i32x4, y: u32) -> i16x4;
|
||||
fn aarch64_vrshrn_n_u32(x: u32x4, y: u32) -> u16x4;
|
||||
fn aarch64_vrshrn_n_s64(x: i64x2, y: u32) -> i32x2;
|
||||
fn aarch64_vrshrn_n_u64(x: u64x2, y: u32) -> u32x2;
|
||||
fn aarch64_vqrshrn_n_s16(x: i16x8, y: u32) -> i8x8;
|
||||
fn aarch64_vqrshrn_n_u16(x: u16x8, y: u32) -> u8x8;
|
||||
fn aarch64_vqrshrn_n_s32(x: i32x4, y: u32) -> i16x4;
|
||||
fn aarch64_vqrshrn_n_u32(x: u32x4, y: u32) -> u16x4;
|
||||
fn aarch64_vqrshrn_n_s64(x: i64x2, y: u32) -> i32x2;
|
||||
fn aarch64_vqrshrn_n_u64(x: u64x2, y: u32) -> u32x2;
|
||||
fn aarch64_vsri_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vsri_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vsri_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vsri_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vsri_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vsri_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vsri_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vsri_u64(x: u64x1, y: u64x1) -> u64x1;
|
||||
fn aarch64_vsriq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vsriq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vsriq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vsriq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vsriq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vsriq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vsriq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vsriq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vsli_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vsli_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vsli_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vsli_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vsli_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vsli_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vsli_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vsli_u64(x: u64x1, y: u64x1) -> u64x1;
|
||||
fn aarch64_vsliq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vsliq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vsliq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vsliq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vsliq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vsliq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vsliq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vsliq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vvqmovn_s16(x: i16x8) -> i8x8;
|
||||
fn aarch64_vvqmovn_u16(x: u16x8) -> u8x8;
|
||||
fn aarch64_vvqmovn_s32(x: i32x4) -> i16x4;
|
||||
fn aarch64_vvqmovn_u32(x: u32x4) -> u16x4;
|
||||
fn aarch64_vvqmovn_s64(x: i64x2) -> i32x2;
|
||||
fn aarch64_vvqmovn_u64(x: u64x2) -> u32x2;
|
||||
fn aarch64_vabs_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vabs_s16(x: i16x4) -> i16x4;
|
||||
fn aarch64_vabs_s32(x: i32x2) -> i32x2;
|
||||
fn aarch64_vabs_s64(x: i64x1) -> i64x1;
|
||||
fn aarch64_vabsq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vabsq_s16(x: i16x8) -> i16x8;
|
||||
fn aarch64_vabsq_s32(x: i32x4) -> i32x4;
|
||||
fn aarch64_vabsq_s64(x: i64x2) -> i64x2;
|
||||
fn aarch64_vabs_f32(x: f32x2) -> f32x2;
|
||||
fn aarch64_vabs_f64(x: f64x1) -> f64x1;
|
||||
fn aarch64_vabsq_f32(x: f32x4) -> f32x4;
|
||||
fn aarch64_vabsq_f64(x: f64x2) -> f64x2;
|
||||
fn aarch64_vqabs_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vqabs_s16(x: i16x4) -> i16x4;
|
||||
fn aarch64_vqabs_s32(x: i32x2) -> i32x2;
|
||||
fn aarch64_vqabs_s64(x: i64x1) -> i64x1;
|
||||
fn aarch64_vqabsq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vqabsq_s16(x: i16x8) -> i16x8;
|
||||
fn aarch64_vqabsq_s32(x: i32x4) -> i32x4;
|
||||
fn aarch64_vqabsq_s64(x: i64x2) -> i64x2;
|
||||
fn aarch64_vqneg_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vqneg_s16(x: i16x4) -> i16x4;
|
||||
fn aarch64_vqneg_s32(x: i32x2) -> i32x2;
|
||||
fn aarch64_vqneg_s64(x: i64x1) -> i64x1;
|
||||
fn aarch64_vqnegq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vqnegq_s16(x: i16x8) -> i16x8;
|
||||
fn aarch64_vqnegq_s32(x: i32x4) -> i32x4;
|
||||
fn aarch64_vqnegq_s64(x: i64x2) -> i64x2;
|
||||
fn aarch64_vclz_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vclz_u8(x: u8x8) -> u8x8;
|
||||
fn aarch64_vclz_s16(x: i16x4) -> i16x4;
|
||||
fn aarch64_vclz_u16(x: u16x4) -> u16x4;
|
||||
fn aarch64_vclz_s32(x: i32x2) -> i32x2;
|
||||
fn aarch64_vclz_u32(x: u32x2) -> u32x2;
|
||||
fn aarch64_vclzq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vclzq_u8(x: u8x16) -> u8x16;
|
||||
fn aarch64_vclzq_s16(x: i16x8) -> i16x8;
|
||||
fn aarch64_vclzq_u16(x: u16x8) -> u16x8;
|
||||
fn aarch64_vclzq_s32(x: i32x4) -> i32x4;
|
||||
fn aarch64_vclzq_u32(x: u32x4) -> u32x4;
|
||||
fn aarch64_vcls_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vcls_u8(x: u8x8) -> u8x8;
|
||||
fn aarch64_vcls_s16(x: i16x4) -> i16x4;
|
||||
fn aarch64_vcls_u16(x: u16x4) -> u16x4;
|
||||
fn aarch64_vcls_s32(x: i32x2) -> i32x2;
|
||||
fn aarch64_vcls_u32(x: u32x2) -> u32x2;
|
||||
fn aarch64_vclsq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vclsq_u8(x: u8x16) -> u8x16;
|
||||
fn aarch64_vclsq_s16(x: i16x8) -> i16x8;
|
||||
fn aarch64_vclsq_u16(x: u16x8) -> u16x8;
|
||||
fn aarch64_vclsq_s32(x: i32x4) -> i32x4;
|
||||
fn aarch64_vclsq_u32(x: u32x4) -> u32x4;
|
||||
fn aarch64_vcnt_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vcnt_u8(x: u8x8) -> u8x8;
|
||||
fn aarch64_vcntq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vcntq_u8(x: u8x16) -> u8x16;
|
||||
fn aarch64_vrecpe_u32(x: u32x2) -> u32x2;
|
||||
fn aarch64_vrecpe_f32(x: f32x2) -> f32x2;
|
||||
fn aarch64_vrecpe_f64(x: f64x1) -> f64x1;
|
||||
fn aarch64_vrecpeq_u32(x: u32x4) -> u32x4;
|
||||
fn aarch64_vrecpeq_f32(x: f32x4) -> f32x4;
|
||||
fn aarch64_vrecpeq_f64(x: f64x2) -> f64x2;
|
||||
fn aarch64_vrecps_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vrecps_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vrecpsq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vrecpsq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vsqrt_f32(x: f32x2) -> f32x2;
|
||||
fn aarch64_vsqrt_f64(x: f64x1) -> f64x1;
|
||||
fn aarch64_vsqrtq_f32(x: f32x4) -> f32x4;
|
||||
fn aarch64_vsqrtq_f64(x: f64x2) -> f64x2;
|
||||
fn aarch64_vrsqrte_u32(x: u32x2) -> u32x2;
|
||||
fn aarch64_vrsqrte_f32(x: f32x2) -> f32x2;
|
||||
fn aarch64_vrsqrte_f64(x: f64x1) -> f64x1;
|
||||
fn aarch64_vrsqrteq_u32(x: u32x4) -> u32x4;
|
||||
fn aarch64_vrsqrteq_f32(x: f32x4) -> f32x4;
|
||||
fn aarch64_vrsqrteq_f64(x: f64x2) -> f64x2;
|
||||
fn aarch64_vrsqrts_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vrsqrts_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vrsqrtsq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vrsqrtsq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vrbit_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vrbit_u8(x: u8x8) -> u8x8;
|
||||
fn aarch64_vrbitq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vrbitq_u8(x: u8x16) -> u8x16;
|
||||
fn aarch64_vpadd_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vpadd_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vpadd_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vpadd_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vpadd_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vpadd_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vpadd_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vpaddq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vpaddq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vpaddq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vpaddq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vpaddq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vpaddq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vpaddq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vpaddq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vpaddq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vpaddq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vpaddl_s16(x: i8x8) -> i16x4;
|
||||
fn aarch64_vpaddl_u16(x: u8x8) -> u16x4;
|
||||
fn aarch64_vpaddl_s32(x: i16x4) -> i32x2;
|
||||
fn aarch64_vpaddl_u32(x: u16x4) -> u32x2;
|
||||
fn aarch64_vpaddl_s64(x: i32x2) -> i64x1;
|
||||
fn aarch64_vpaddl_u64(x: u32x2) -> u64x1;
|
||||
fn aarch64_vpaddlq_s16(x: i8x16) -> i16x8;
|
||||
fn aarch64_vpaddlq_u16(x: u8x16) -> u16x8;
|
||||
fn aarch64_vpaddlq_s32(x: i16x8) -> i32x4;
|
||||
fn aarch64_vpaddlq_u32(x: u16x8) -> u32x4;
|
||||
fn aarch64_vpaddlq_s64(x: i32x4) -> i64x2;
|
||||
fn aarch64_vpaddlq_u64(x: u32x4) -> u64x2;
|
||||
fn aarch64_vpmax_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vpmax_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vpmax_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vpmax_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vpmax_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vpmax_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vpmax_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vpmaxq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vpmaxq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vpmaxq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vpmaxq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vpmaxq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vpmaxq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vpmaxq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vpmaxq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vpmaxq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vpmaxq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vpmin_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vpmin_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vpmin_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vpmin_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vpmin_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vpmin_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vpmin_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vpminq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vpminq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vpminq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vpminq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vpminq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vpminq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vpminq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vpminq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vpminq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vpminq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vpmaxnm_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vpmaxnm_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vpmaxnm_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vpmaxnm_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vpmaxnm_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vpmaxnm_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vpmaxnm_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vpmaxnmq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vpmaxnmq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vpmaxnmq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vpmaxnmq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vpmaxnmq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vpmaxnmq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vpmaxnmq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vpmaxnmq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vpmaxnmq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vpmaxnmq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vpminnm_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vpminnmq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vpminnmq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vaddv_s8(x: i8x8) -> i8;
|
||||
fn aarch64_vaddv_u8(x: u8x8) -> u8;
|
||||
fn aarch64_vaddv_s16(x: i16x4) -> i16;
|
||||
fn aarch64_vaddv_u16(x: u16x4) -> u16;
|
||||
fn aarch64_vaddv_s32(x: i32x2) -> i32;
|
||||
fn aarch64_vaddv_u32(x: u32x2) -> u32;
|
||||
fn aarch64_vaddv_f32(x: f32x2) -> f32;
|
||||
fn aarch64_vaddvq_s8(x: i8x16) -> i8;
|
||||
fn aarch64_vaddvq_u8(x: u8x16) -> u8;
|
||||
fn aarch64_vaddvq_s16(x: i16x8) -> i16;
|
||||
fn aarch64_vaddvq_u16(x: u16x8) -> u16;
|
||||
fn aarch64_vaddvq_s32(x: i32x4) -> i32;
|
||||
fn aarch64_vaddvq_u32(x: u32x4) -> u32;
|
||||
    fn aarch64_vaddvq_f32(x: f32x4) -> f32;
    fn aarch64_vaddvq_s64(x: i64x2) -> i64;
    fn aarch64_vaddvq_u64(x: u64x2) -> u64;
    fn aarch64_vaddvq_f64(x: f64x2) -> f64;
    fn aarch64_vaddlv_s8(x: i8x8) -> i16;
    fn aarch64_vaddlv_u8(x: u8x8) -> u16;
    fn aarch64_vaddlv_s16(x: i16x4) -> i32;
    fn aarch64_vaddlv_u16(x: u16x4) -> u32;
    fn aarch64_vaddlv_s32(x: i32x2) -> i64;
    fn aarch64_vaddlv_u32(x: u32x2) -> u64;
    fn aarch64_vaddlvq_s8(x: i8x16) -> i16;
    fn aarch64_vaddlvq_u8(x: u8x16) -> u16;
    fn aarch64_vaddlvq_s16(x: i16x8) -> i32;
    fn aarch64_vaddlvq_u16(x: u16x8) -> u32;
    fn aarch64_vaddlvq_s32(x: i32x4) -> i64;
    fn aarch64_vaddlvq_u32(x: u32x4) -> u64;
    fn aarch64_vmaxv_s8(x: i8x8) -> i8;
    fn aarch64_vmaxv_u8(x: u8x8) -> u8;
    fn aarch64_vmaxv_s16(x: i16x4) -> i16;
    fn aarch64_vmaxv_u16(x: u16x4) -> u16;
    fn aarch64_vmaxv_s32(x: i32x2) -> i32;
    fn aarch64_vmaxv_u32(x: u32x2) -> u32;
    fn aarch64_vmaxv_f32(x: f32x2) -> f32;
    fn aarch64_vmaxvq_s8(x: i8x16) -> i8;
    fn aarch64_vmaxvq_u8(x: u8x16) -> u8;
    fn aarch64_vmaxvq_s16(x: i16x8) -> i16;
    fn aarch64_vmaxvq_u16(x: u16x8) -> u16;
    fn aarch64_vmaxvq_s32(x: i32x4) -> i32;
    fn aarch64_vmaxvq_u32(x: u32x4) -> u32;
    fn aarch64_vmaxvq_f32(x: f32x4) -> f32;
    fn aarch64_vmaxvq_f64(x: f64x2) -> f64;
    fn aarch64_vminv_s8(x: i8x8) -> i8;
    fn aarch64_vminv_u8(x: u8x8) -> u8;
    fn aarch64_vminv_s16(x: i16x4) -> i16;
    fn aarch64_vminv_u16(x: u16x4) -> u16;
    fn aarch64_vminv_s32(x: i32x2) -> i32;
    fn aarch64_vminv_u32(x: u32x2) -> u32;
    fn aarch64_vminv_f32(x: f32x2) -> f32;
    fn aarch64_vminvq_s8(x: i8x16) -> i8;
    fn aarch64_vminvq_u8(x: u8x16) -> u8;
    fn aarch64_vminvq_s16(x: i16x8) -> i16;
    fn aarch64_vminvq_u16(x: u16x8) -> u16;
    fn aarch64_vminvq_s32(x: i32x4) -> i32;
    fn aarch64_vminvq_u32(x: u32x4) -> u32;
    fn aarch64_vminvq_f32(x: f32x4) -> f32;
    fn aarch64_vminvq_f64(x: f64x2) -> f64;
    fn aarch64_vmaxnmv_f32(x: f32x2) -> f32;
    fn aarch64_vmaxnmvq_f32(x: f32x4) -> f32;
    fn aarch64_vmaxnmvq_f64(x: f64x2) -> f64;
    fn aarch64_vminnmv_f32(x: f32x2) -> f32;
    fn aarch64_vminnmvq_f32(x: f32x4) -> f32;
    fn aarch64_vminnmvq_f64(x: f64x2) -> f64;
    fn aarch64_vqtbl1_s8(x: i8x16, y: u8x8) -> i8x8;
    fn aarch64_vqtbl1_u8(x: u8x16, y: u8x8) -> u8x8;
    fn aarch64_vqtbl1q_s8(x: i8x16, y: u8x16) -> i8x16;
    fn aarch64_vqtbl1q_u8(x: u8x16, y: u8x16) -> u8x16;
    fn aarch64_vqtbx1_s8(x: i8x8, y: i8x16, z: u8x8) -> i8x8;
    fn aarch64_vqtbx1_u8(x: u8x8, y: u8x16, z: u8x8) -> u8x8;
    fn aarch64_vqtbx1q_s8(x: i8x16, y: i8x16, z: u8x16) -> i8x16;
    fn aarch64_vqtbx1q_u8(x: u8x16, y: u8x16, z: u8x16) -> u8x16;
    fn aarch64_vqtbl2_s8(x: (i8x16, i8x16), y: u8x8) -> i8x8;
    fn aarch64_vqtbl2_u8(x: (u8x16, u8x16), y: u8x8) -> u8x8;
    fn aarch64_vqtbl2q_s8(x: (i8x16, i8x16), y: u8x16) -> i8x16;
    fn aarch64_vqtbl2q_u8(x: (u8x16, u8x16), y: u8x16) -> u8x16;
    fn aarch64_vqtbx2_s8(x: (i8x16, i8x16), y: u8x8) -> i8x8;
    fn aarch64_vqtbx2_u8(x: (u8x16, u8x16), y: u8x8) -> u8x8;
    fn aarch64_vqtbx2q_s8(x: (i8x16, i8x16), y: u8x16) -> i8x16;
    fn aarch64_vqtbx2q_u8(x: (u8x16, u8x16), y: u8x16) -> u8x16;
    fn aarch64_vqtbl3_s8(x: (i8x16, i8x16, i8x16), y: u8x8) -> i8x8;
    fn aarch64_vqtbl3_u8(x: (u8x16, u8x16, u8x16), y: u8x8) -> u8x8;
    fn aarch64_vqtbl3q_s8(x: (i8x16, i8x16, i8x16), y: u8x16) -> i8x16;
    fn aarch64_vqtbl3q_u8(x: (u8x16, u8x16, u8x16), y: u8x16) -> u8x16;
    fn aarch64_vqtbx3_s8(x: i8x8, y: (i8x16, i8x16, i8x16), z: u8x8) -> i8x8;
    fn aarch64_vqtbx3_u8(x: u8x8, y: (u8x16, u8x16, u8x16), z: u8x8) -> u8x8;
    fn aarch64_vqtbx3q_s8(x: i8x16, y: (i8x16, i8x16, i8x16), z: u8x16) -> i8x16;
    fn aarch64_vqtbx3q_u8(x: u8x16, y: (u8x16, u8x16, u8x16), z: u8x16) -> u8x16;
    fn aarch64_vqtbl4_s8(x: (i8x16, i8x16, i8x16, i8x16), y: u8x8) -> i8x8;
    fn aarch64_vqtbl4_u8(x: (u8x16, u8x16, u8x16, u8x16), y: u8x8) -> u8x8;
    fn aarch64_vqtbl4q_s8(x: (i8x16, i8x16, i8x16, i8x16), y: u8x16) -> i8x16;
    fn aarch64_vqtbl4q_u8(x: (u8x16, u8x16, u8x16, u8x16), y: u8x16) -> u8x16;
}

pub trait Aarch64F32x4 {
    fn to_f64(self) -> f64x2;
}
impl Aarch64F32x4 for f32x4 {
    #[inline]
    fn to_f64(self) -> f64x2 {
        unsafe {
            simd_cast(f32x2(self.0, self.1))
        }
    }
}

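A minimal usage sketch for the widening conversion above, assuming the crate is built as `simd` on a compatible nightly and that `Aarch64F32x4` is importable from `simd::aarch64::neon` (the module path is inferred from the layout shown here):

```rust
extern crate simd;

#[cfg(target_arch = "aarch64")]
fn sum_low_lanes(v: simd::f32x4) -> f64 {
    use simd::aarch64::neon::Aarch64F32x4;
    // `to_f64` widens lanes 0 and 1 of the f32x4 into an f64x2.
    let wide = v.to_f64();
    wide.extract(0) + wide.extract(1)
}
```
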
pub trait Aarch64U8x16 {
    fn table_lookup_1(self, t0: u8x16) -> u8x16;
}
impl Aarch64U8x16 for u8x16 {
    #[inline]
    fn table_lookup_1(self, t0: u8x16) -> u8x16 {
        unsafe {aarch64_vqtbl1q_u8(t0, self)}
    }
}
pub trait Aarch64I8x16 {
    fn table_lookup_1(self, t0: i8x16) -> i8x16;
}
impl Aarch64I8x16 for i8x16 {
    #[inline]
    fn table_lookup_1(self, t0: i8x16) -> i8x16 {
        // Duplicating the table makes the two-table lookup behave like a
        // one-table lookup; the indices are bitcast to the unsigned type.
        unsafe {aarch64_vqtbl2q_s8((t0, t0), ::bitcast(self))}
    }
}

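A sketch of a byte permutation built on `table_lookup_1` (TBL-family semantics: each index selects a byte of the table, and out-of-range indices yield zero). The import path is an assumption based on the module layout:

```rust
extern crate simd;

#[cfg(target_arch = "aarch64")]
fn reverse_bytes(table: simd::u8x16) -> simd::u8x16 {
    use simd::aarch64::neon::Aarch64U8x16;
    // Index vector 15, 14, ..., 0 picks the table's bytes in reverse.
    let idx = simd::u8x16::new(15, 14, 13, 12, 11, 10, 9, 8,
                               7, 6, 5, 4, 3, 2, 1, 0);
    idx.table_lookup_1(table)
}
```
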
#[doc(hidden)]
pub mod common {
    use super::super::super::*;
    use std::mem;

    #[inline]
    pub fn f32x4_sqrt(x: f32x4) -> f32x4 {
        unsafe {super::aarch64_vsqrtq_f32(x)}
    }
    #[inline]
    pub fn f32x4_approx_rsqrt(x: f32x4) -> f32x4 {
        unsafe {super::aarch64_vrsqrteq_f32(x)}
    }
    #[inline]
    pub fn f32x4_approx_reciprocal(x: f32x4) -> f32x4 {
        unsafe {super::aarch64_vrecpeq_f32(x)}
    }
    #[inline]
    pub fn f32x4_max(x: f32x4, y: f32x4) -> f32x4 {
        unsafe {super::aarch64_vmaxq_f32(x, y)}
    }
    #[inline]
    pub fn f32x4_min(x: f32x4, y: f32x4) -> f32x4 {
        unsafe {super::aarch64_vminq_f32(x, y)}
    }

    macro_rules! bools {
        ($($ty: ty, $all: ident ($min: ident), $any: ident ($max: ident);)*) => {
            $(
                #[inline]
                pub fn $all(x: $ty) -> bool {
                    unsafe {
                        super::$min(mem::transmute(x)) != 0
                    }
                }
                #[inline]
                pub fn $any(x: $ty) -> bool {
                    unsafe {
                        super::$max(mem::transmute(x)) != 0
                    }
                }
            )*
        }
    }

    bools! {
        bool32fx4, bool32fx4_all(aarch64_vminvq_u32), bool32fx4_any(aarch64_vmaxvq_u32);
        bool8ix16, bool8ix16_all(aarch64_vminvq_u8), bool8ix16_any(aarch64_vmaxvq_u8);
        bool16ix8, bool16ix8_all(aarch64_vminvq_u16), bool16ix8_any(aarch64_vmaxvq_u16);
        bool32ix4, bool32ix4_all(aarch64_vminvq_u32), bool32ix4_any(aarch64_vmaxvq_u32);
    }
}
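The `all`/`any` implementations above lean on a NEON property: a boolean lane is either all-ones or all-zeros, so the horizontal minimum (`vminvq`) is non-zero exactly when every lane is set, and the horizontal maximum (`vmaxvq`) is non-zero when at least one lane is. A scalar model of the same reduction, runnable on stable Rust:

```rust
// Scalar model of the vminvq/vmaxvq reductions used for `all` and `any`.
fn all4(mask: [u32; 4]) -> bool {
    mask.iter().copied().min().unwrap() != 0
}
fn any4(mask: [u32; 4]) -> bool {
    mask.iter().copied().max().unwrap() != 0
}

fn main() {
    assert!(all4([!0; 4]));
    assert!(!all4([!0, 0, !0, !0]));
    assert!(any4([0, 0, !0, 0]));
    assert!(!any4([0; 4]));
}
```
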
@@ -0,0 +1,4 @@
//! Features specific to ARM CPUs.

#[cfg(any(feature = "doc", target_feature = "neon"))]
pub mod neon;

@@ -0,0 +1,530 @@
use super::super::*;
use sixty_four::{i64x2, u64x2};

#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u32x2(u32, u32);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i32x2(i32, i32);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct f32x2(f32, f32);

#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u16x4(u16, u16, u16, u16);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i16x4(i16, i16, i16, i16);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u8x8(u8, u8, u8, u8,
                u8, u8, u8, u8);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i8x8(i8, i8, i8, i8,
                i8, i8, i8, i8);

#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i64x1(i64);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u64x1(u64);

#[allow(dead_code)]
extern "platform-intrinsic" {
    fn arm_vhadd_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vhadd_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vhadd_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vhadd_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vhadd_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vhadd_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vhaddq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vhaddq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vhaddq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vhaddq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vhaddq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vhaddq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vrhadd_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vrhadd_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vrhadd_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vrhadd_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vrhadd_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vrhadd_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vrhaddq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vrhaddq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vrhaddq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vrhaddq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vrhaddq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vrhaddq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vqadd_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vqadd_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vqadd_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vqadd_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vqadd_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vqadd_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vqadd_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vqadd_u64(x: u64x1, y: u64x1) -> u64x1;
    fn arm_vqaddq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vqaddq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vqaddq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vqaddq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vqaddq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vqaddq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vqaddq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vqaddq_u64(x: u64x2, y: u64x2) -> u64x2;
    fn arm_vraddhn_s16(x: i16x8, y: i16x8) -> i8x8;
    fn arm_vraddhn_u16(x: u16x8, y: u16x8) -> u8x8;
    fn arm_vraddhn_s32(x: i32x4, y: i32x4) -> i16x4;
    fn arm_vraddhn_u32(x: u32x4, y: u32x4) -> u16x4;
    fn arm_vraddhn_s64(x: i64x2, y: i64x2) -> i32x2;
    fn arm_vraddhn_u64(x: u64x2, y: u64x2) -> u32x2;
    fn arm_vfma_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vfmaq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vqdmulh_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vqdmulh_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vqdmulhq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vqdmulhq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vqrdmulh_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vqrdmulh_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vqrdmulhq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vqrdmulhq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vmull_s8(x: i8x8, y: i8x8) -> i16x8;
    fn arm_vmull_u8(x: u8x8, y: u8x8) -> u16x8;
    fn arm_vmull_s16(x: i16x4, y: i16x4) -> i32x4;
    fn arm_vmull_u16(x: u16x4, y: u16x4) -> u32x4;
    fn arm_vmull_s32(x: i32x2, y: i32x2) -> i64x2;
    fn arm_vmull_u32(x: u32x2, y: u32x2) -> u64x2;
    fn arm_vqdmullq_s8(x: i8x8, y: i8x8) -> i16x8;
    fn arm_vqdmullq_s16(x: i16x4, y: i16x4) -> i32x4;
    fn arm_vhsub_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vhsub_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vhsub_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vhsub_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vhsub_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vhsub_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vhsubq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vhsubq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vhsubq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vhsubq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vhsubq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vhsubq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vqsub_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vqsub_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vqsub_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vqsub_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vqsub_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vqsub_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vqsub_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vqsub_u64(x: u64x1, y: u64x1) -> u64x1;
    fn arm_vqsubq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vqsubq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vqsubq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vqsubq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vqsubq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vqsubq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vqsubq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vqsubq_u64(x: u64x2, y: u64x2) -> u64x2;
    fn arm_vrsubhn_s16(x: i16x8, y: i16x8) -> i8x8;
    fn arm_vrsubhn_u16(x: u16x8, y: u16x8) -> u8x8;
    fn arm_vrsubhn_s32(x: i32x4, y: i32x4) -> i16x4;
    fn arm_vrsubhn_u32(x: u32x4, y: u32x4) -> u16x4;
    fn arm_vrsubhn_s64(x: i64x2, y: i64x2) -> i32x2;
    fn arm_vrsubhn_u64(x: u64x2, y: u64x2) -> u32x2;
    fn arm_vabd_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vabd_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vabd_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vabd_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vabd_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vabd_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vabd_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vabdq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vabdq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vabdq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vabdq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vabdq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vabdq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vabdq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vmax_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vmax_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vmax_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vmax_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vmax_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vmax_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vmax_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vmaxq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vmaxq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vmaxq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vmaxq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vmaxq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vmaxq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vmaxq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vmin_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vmin_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vmin_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vmin_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vmin_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vmin_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vmin_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vminq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vminq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vminq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vminq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vminq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vminq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vminq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vshl_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vshl_u8(x: u8x8, y: i8x8) -> u8x8;
    fn arm_vshl_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vshl_u16(x: u16x4, y: i16x4) -> u16x4;
    fn arm_vshl_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vshl_u32(x: u32x2, y: i32x2) -> u32x2;
    fn arm_vshl_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vshl_u64(x: u64x1, y: i64x1) -> u64x1;
    fn arm_vshlq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vshlq_u8(x: u8x16, y: i8x16) -> u8x16;
    fn arm_vshlq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vshlq_u16(x: u16x8, y: i16x8) -> u16x8;
    fn arm_vshlq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vshlq_u32(x: u32x4, y: i32x4) -> u32x4;
    fn arm_vshlq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vshlq_u64(x: u64x2, y: i64x2) -> u64x2;
    fn arm_vqshl_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vqshl_u8(x: u8x8, y: i8x8) -> u8x8;
    fn arm_vqshl_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vqshl_u16(x: u16x4, y: i16x4) -> u16x4;
    fn arm_vqshl_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vqshl_u32(x: u32x2, y: i32x2) -> u32x2;
    fn arm_vqshl_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vqshl_u64(x: u64x1, y: i64x1) -> u64x1;
    fn arm_vqshlq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vqshlq_u8(x: u8x16, y: i8x16) -> u8x16;
    fn arm_vqshlq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vqshlq_u16(x: u16x8, y: i16x8) -> u16x8;
    fn arm_vqshlq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vqshlq_u32(x: u32x4, y: i32x4) -> u32x4;
    fn arm_vqshlq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vqshlq_u64(x: u64x2, y: i64x2) -> u64x2;
    fn arm_vrshl_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vrshl_u8(x: u8x8, y: i8x8) -> u8x8;
    fn arm_vrshl_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vrshl_u16(x: u16x4, y: i16x4) -> u16x4;
    fn arm_vrshl_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vrshl_u32(x: u32x2, y: i32x2) -> u32x2;
    fn arm_vrshl_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vrshl_u64(x: u64x1, y: i64x1) -> u64x1;
    fn arm_vrshlq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vrshlq_u8(x: u8x16, y: i8x16) -> u8x16;
    fn arm_vrshlq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vrshlq_u16(x: u16x8, y: i16x8) -> u16x8;
    fn arm_vrshlq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vrshlq_u32(x: u32x4, y: i32x4) -> u32x4;
    fn arm_vrshlq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vrshlq_u64(x: u64x2, y: i64x2) -> u64x2;
    fn arm_vqrshl_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vqrshl_u8(x: u8x8, y: i8x8) -> u8x8;
    fn arm_vqrshl_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vqrshl_u16(x: u16x4, y: i16x4) -> u16x4;
    fn arm_vqrshl_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vqrshl_u32(x: u32x2, y: i32x2) -> u32x2;
    fn arm_vqrshl_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vqrshl_u64(x: u64x1, y: i64x1) -> u64x1;
    fn arm_vqrshlq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vqrshlq_u8(x: u8x16, y: i8x16) -> u8x16;
    fn arm_vqrshlq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vqrshlq_u16(x: u16x8, y: i16x8) -> u16x8;
    fn arm_vqrshlq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vqrshlq_u32(x: u32x4, y: i32x4) -> u32x4;
    fn arm_vqrshlq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vqrshlq_u64(x: u64x2, y: i64x2) -> u64x2;
    fn arm_vqshrun_n_s16(x: i16x8, y: u32) -> i8x8;
    fn arm_vqshrun_n_s32(x: i32x4, y: u32) -> i16x4;
    fn arm_vqshrun_n_s64(x: i64x2, y: u32) -> i32x2;
    fn arm_vqrshrun_n_s16(x: i16x8, y: u32) -> i8x8;
    fn arm_vqrshrun_n_s32(x: i32x4, y: u32) -> i16x4;
    fn arm_vqrshrun_n_s64(x: i64x2, y: u32) -> i32x2;
    fn arm_vqshrn_n_s16(x: i16x8, y: u32) -> i8x8;
    fn arm_vqshrn_n_u16(x: u16x8, y: u32) -> u8x8;
    fn arm_vqshrn_n_s32(x: i32x4, y: u32) -> i16x4;
    fn arm_vqshrn_n_u32(x: u32x4, y: u32) -> u16x4;
    fn arm_vqshrn_n_s64(x: i64x2, y: u32) -> i32x2;
    fn arm_vqshrn_n_u64(x: u64x2, y: u32) -> u32x2;
    fn arm_vrshrn_n_s16(x: i16x8, y: u32) -> i8x8;
    fn arm_vrshrn_n_u16(x: u16x8, y: u32) -> u8x8;
    fn arm_vrshrn_n_s32(x: i32x4, y: u32) -> i16x4;
    fn arm_vrshrn_n_u32(x: u32x4, y: u32) -> u16x4;
    fn arm_vrshrn_n_s64(x: i64x2, y: u32) -> i32x2;
    fn arm_vrshrn_n_u64(x: u64x2, y: u32) -> u32x2;
    fn arm_vqrshrn_n_s16(x: i16x8, y: u32) -> i8x8;
    fn arm_vqrshrn_n_u16(x: u16x8, y: u32) -> u8x8;
    fn arm_vqrshrn_n_s32(x: i32x4, y: u32) -> i16x4;
    fn arm_vqrshrn_n_u32(x: u32x4, y: u32) -> u16x4;
    fn arm_vqrshrn_n_s64(x: i64x2, y: u32) -> i32x2;
    fn arm_vqrshrn_n_u64(x: u64x2, y: u32) -> u32x2;
    fn arm_vsri_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vsri_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vsri_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vsri_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vsri_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vsri_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vsri_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vsri_u64(x: u64x1, y: u64x1) -> u64x1;
    fn arm_vsriq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vsriq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vsriq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vsriq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vsriq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vsriq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vsriq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vsriq_u64(x: u64x2, y: u64x2) -> u64x2;
    fn arm_vsli_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vsli_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vsli_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vsli_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vsli_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vsli_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vsli_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vsli_u64(x: u64x1, y: u64x1) -> u64x1;
    fn arm_vsliq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vsliq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vsliq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vsliq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vsliq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vsliq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vsliq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vsliq_u64(x: u64x2, y: u64x2) -> u64x2;
    fn arm_vvqmovn_s16(x: i16x8) -> i8x8;
    fn arm_vvqmovn_u16(x: u16x8) -> u8x8;
    fn arm_vvqmovn_s32(x: i32x4) -> i16x4;
    fn arm_vvqmovn_u32(x: u32x4) -> u16x4;
    fn arm_vvqmovn_s64(x: i64x2) -> i32x2;
    fn arm_vvqmovn_u64(x: u64x2) -> u32x2;
    fn arm_vabs_s8(x: i8x8) -> i8x8;
    fn arm_vabs_s16(x: i16x4) -> i16x4;
    fn arm_vabs_s32(x: i32x2) -> i32x2;
    fn arm_vabsq_s8(x: i8x16) -> i8x16;
    fn arm_vabsq_s16(x: i16x8) -> i16x8;
    fn arm_vabsq_s32(x: i32x4) -> i32x4;
    fn arm_vabs_f32(x: f32x2) -> f32x2;
    fn arm_vabsq_f32(x: f32x4) -> f32x4;
    fn arm_vqabs_s8(x: i8x8) -> i8x8;
    fn arm_vqabs_s16(x: i16x4) -> i16x4;
    fn arm_vqabs_s32(x: i32x2) -> i32x2;
    fn arm_vqabsq_s8(x: i8x16) -> i8x16;
    fn arm_vqabsq_s16(x: i16x8) -> i16x8;
    fn arm_vqabsq_s32(x: i32x4) -> i32x4;
    fn arm_vqneg_s8(x: i8x8) -> i8x8;
    fn arm_vqneg_s16(x: i16x4) -> i16x4;
    fn arm_vqneg_s32(x: i32x2) -> i32x2;
    fn arm_vqnegq_s8(x: i8x16) -> i8x16;
    fn arm_vqnegq_s16(x: i16x8) -> i16x8;
    fn arm_vqnegq_s32(x: i32x4) -> i32x4;
    fn arm_vclz_s8(x: i8x8) -> i8x8;
    fn arm_vclz_u8(x: u8x8) -> u8x8;
    fn arm_vclz_s16(x: i16x4) -> i16x4;
    fn arm_vclz_u16(x: u16x4) -> u16x4;
    fn arm_vclz_s32(x: i32x2) -> i32x2;
    fn arm_vclz_u32(x: u32x2) -> u32x2;
    fn arm_vclzq_s8(x: i8x16) -> i8x16;
    fn arm_vclzq_u8(x: u8x16) -> u8x16;
    fn arm_vclzq_s16(x: i16x8) -> i16x8;
    fn arm_vclzq_u16(x: u16x8) -> u16x8;
    fn arm_vclzq_s32(x: i32x4) -> i32x4;
    fn arm_vclzq_u32(x: u32x4) -> u32x4;
    fn arm_vcls_s8(x: i8x8) -> i8x8;
    fn arm_vcls_u8(x: u8x8) -> u8x8;
    fn arm_vcls_s16(x: i16x4) -> i16x4;
    fn arm_vcls_u16(x: u16x4) -> u16x4;
    fn arm_vcls_s32(x: i32x2) -> i32x2;
    fn arm_vcls_u32(x: u32x2) -> u32x2;
    fn arm_vclsq_s8(x: i8x16) -> i8x16;
    fn arm_vclsq_u8(x: u8x16) -> u8x16;
    fn arm_vclsq_s16(x: i16x8) -> i16x8;
    fn arm_vclsq_u16(x: u16x8) -> u16x8;
    fn arm_vclsq_s32(x: i32x4) -> i32x4;
    fn arm_vclsq_u32(x: u32x4) -> u32x4;
    fn arm_vcnt_s8(x: i8x8) -> i8x8;
    fn arm_vcnt_u8(x: u8x8) -> u8x8;
    fn arm_vcntq_s8(x: i8x16) -> i8x16;
    fn arm_vcntq_u8(x: u8x16) -> u8x16;
    fn arm_vrecpe_u32(x: u32x2) -> u32x2;
    fn arm_vrecpe_f32(x: f32x2) -> f32x2;
    fn arm_vrecpeq_u32(x: u32x4) -> u32x4;
    fn arm_vrecpeq_f32(x: f32x4) -> f32x4;
    fn arm_vrecps_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vrecpsq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vsqrt_f32(x: f32x2) -> f32x2;
    fn arm_vsqrtq_f32(x: f32x4) -> f32x4;
    fn arm_vrsqrte_u32(x: u32x2) -> u32x2;
    fn arm_vrsqrte_f32(x: f32x2) -> f32x2;
    fn arm_vrsqrteq_u32(x: u32x4) -> u32x4;
    fn arm_vrsqrteq_f32(x: f32x4) -> f32x4;
    fn arm_vrsqrts_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vrsqrtsq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vbsl_s8(x: u8x8, y: i8x8) -> i8x8;
    fn arm_vbsl_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vbsl_s16(x: u16x4, y: i16x4) -> i16x4;
    fn arm_vbsl_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vbsl_s32(x: u32x2, y: i32x2) -> i32x2;
    fn arm_vbsl_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vbsl_s64(x: u64x1, y: i64x1) -> i64x1;
    fn arm_vbsl_u64(x: u64x1, y: u64x1) -> u64x1;
    fn arm_vbslq_s8(x: u8x16, y: i8x16) -> i8x16;
    fn arm_vbslq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vbslq_s16(x: u16x8, y: i16x8) -> i16x8;
    fn arm_vbslq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vbslq_s32(x: u32x4, y: i32x4) -> i32x4;
    fn arm_vbslq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vbslq_s64(x: u64x2, y: i64x2) -> i64x2;
    fn arm_vbslq_u64(x: u64x2, y: u64x2) -> u64x2;
    fn arm_vpadd_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vpadd_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vpadd_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vpadd_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vpadd_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vpadd_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vpadd_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vpaddl_s16(x: i8x8) -> i16x4;
    fn arm_vpaddl_u16(x: u8x8) -> u16x4;
    fn arm_vpaddl_s32(x: i16x4) -> i32x2;
    fn arm_vpaddl_u32(x: u16x4) -> u32x2;
    fn arm_vpaddl_s64(x: i32x2) -> i64x1;
    fn arm_vpaddl_u64(x: u32x2) -> u64x1;
    fn arm_vpaddlq_s16(x: i8x16) -> i16x8;
    fn arm_vpaddlq_u16(x: u8x16) -> u16x8;
    fn arm_vpaddlq_s32(x: i16x8) -> i32x4;
    fn arm_vpaddlq_u32(x: u16x8) -> u32x4;
    fn arm_vpaddlq_s64(x: i32x4) -> i64x2;
    fn arm_vpaddlq_u64(x: u32x4) -> u64x2;
    fn arm_vpadal_s16(x: i16x4, y: i8x8) -> i16x4;
    fn arm_vpadal_u16(x: u16x4, y: u8x8) -> u16x4;
    fn arm_vpadal_s32(x: i32x2, y: i16x4) -> i32x2;
    fn arm_vpadal_u32(x: u32x2, y: u16x4) -> u32x2;
    fn arm_vpadal_s64(x: i64x1, y: i32x2) -> i64x1;
    fn arm_vpadal_u64(x: u64x1, y: u32x2) -> u64x1;
    fn arm_vpadalq_s16(x: i16x8, y: i8x16) -> i16x8;
    fn arm_vpadalq_u16(x: u16x8, y: u8x16) -> u16x8;
    fn arm_vpadalq_s32(x: i32x4, y: i16x8) -> i32x4;
    fn arm_vpadalq_u32(x: u32x4, y: u16x8) -> u32x4;
    fn arm_vpadalq_s64(x: i64x2, y: i32x4) -> i64x2;
    fn arm_vpadalq_u64(x: u64x2, y: u32x4) -> u64x2;
    fn arm_vpmax_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vpmax_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vpmax_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vpmax_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vpmax_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vpmax_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vpmax_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vpmin_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vpmin_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vpmin_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vpmin_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vpmin_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vpmin_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vpmin_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vpminq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vpminq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vpminq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vpminq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vpminq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vpminq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vpminq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vtbl1_s8(x: i8x8, y: u8x8) -> i8x8;
    fn arm_vtbl1_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vtbx1_s8(x: i8x8, y: i8x8, z: u8x8) -> i8x8;
    fn arm_vtbx1_u8(x: u8x8, y: u8x8, z: u8x8) -> u8x8;
    fn arm_vtbl2_s8(x: (i8x8, i8x8), y: u8x8) -> i8x8;
    fn arm_vtbl2_u8(x: (u8x8, u8x8), y: u8x8) -> u8x8;
    fn arm_vtbx2_s8(x: (i8x8, i8x8), y: u8x8) -> i8x8;
    fn arm_vtbx2_u8(x: (u8x8, u8x8), y: u8x8) -> u8x8;
    fn arm_vtbl3_s8(x: (i8x8, i8x8, i8x8), y: u8x8) -> i8x8;
    fn arm_vtbl3_u8(x: (u8x8, u8x8, u8x8), y: u8x8) -> u8x8;
    fn arm_vtbx3_s8(x: i8x8, y: (i8x8, i8x8, i8x8), z: u8x8) -> i8x8;
    fn arm_vtbx3_u8(x: u8x8, y: (u8x8, u8x8, u8x8), z: u8x8) -> u8x8;
    fn arm_vtbl4_s8(x: (i8x8, i8x8, i8x8, i8x8), y: u8x8) -> i8x8;
    fn arm_vtbl4_u8(x: (u8x8, u8x8, u8x8, u8x8), y: u8x8) -> u8x8;
    fn arm_vtbx4_s8(x: i8x8, y: (i8x8, i8x8, i8x8, i8x8), z: u8x8) -> i8x8;
    fn arm_vtbx4_u8(x: u8x8, y: (u8x8, u8x8, u8x8, u8x8), z: u8x8) -> u8x8;
}

impl u8x8 {
    /// Look up each lane of `self` in the 8-byte table `t0`.
    #[inline]
    pub fn table_lookup_1(self, t0: u8x8) -> u8x8 {
        unsafe {arm_vtbl1_u8(t0, self)}
    }
    /// Look up each lane of `self` in the 16-byte table `(t0, t1)`.
    #[inline]
    pub fn table_lookup_2(self, t0: u8x8, t1: u8x8) -> u8x8 {
        unsafe {arm_vtbl2_u8((t0, t1), self)}
    }
    /// Look up each lane of `self` in the 24-byte table `(t0, t1, t2)`.
    #[inline]
    pub fn table_lookup_3(self, t0: u8x8, t1: u8x8, t2: u8x8) -> u8x8 {
        unsafe {arm_vtbl3_u8((t0, t1, t2), self)}
    }
    /// Look up each lane of `self` in the 32-byte table `(t0, t1, t2, t3)`.
    #[inline]
    pub fn table_lookup_4(self, t0: u8x8, t1: u8x8, t2: u8x8, t3: u8x8) -> u8x8 {
        unsafe {arm_vtbl4_u8((t0, t1, t2, t3), self)}
    }
}

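The 32-bit NEON `VTBL` family looks up eight destination bytes in a table of one to four 64-bit registers, so `table_lookup_2` indexes into 16 table bytes. The vendored module exposes no constructors for these 64-bit vectors, so this sketch transmutes from byte arrays of the same size and assumes the crate compiles for a NEON-enabled `arm` target:

```rust
extern crate simd;

#[cfg(target_arch = "arm")]
fn lookup16(table: [u8; 16], indices: [u8; 8]) -> [u8; 8] {
    use simd::arm::neon::u8x8;
    use std::mem::transmute;
    unsafe {
        // Both [u8; 8] and u8x8 are 8 bytes, so a by-value transmute is legal.
        let lo: u8x8 = transmute([table[0], table[1], table[2], table[3],
                                  table[4], table[5], table[6], table[7]]);
        let hi: u8x8 = transmute([table[8], table[9], table[10], table[11],
                                  table[12], table[13], table[14], table[15]]);
        let idx: u8x8 = transmute(indices);
        // Each index in 0..16 selects a byte of (lo, hi); larger indices give 0.
        transmute(idx.table_lookup_2(lo, hi))
    }
}
```
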
#[doc(hidden)]
pub mod common {
    use super::super::super::*;
    use super::*;
    use std::mem;

    #[inline]
    pub fn f32x4_sqrt(x: f32x4) -> f32x4 {
        unsafe {super::arm_vsqrtq_f32(x)}
    }
    #[inline]
    pub fn f32x4_approx_rsqrt(x: f32x4) -> f32x4 {
        unsafe {super::arm_vrsqrteq_f32(x)}
    }
    #[inline]
    pub fn f32x4_approx_reciprocal(x: f32x4) -> f32x4 {
        unsafe {super::arm_vrecpeq_f32(x)}
    }
    #[inline]
    pub fn f32x4_max(x: f32x4, y: f32x4) -> f32x4 {
        unsafe {super::arm_vmaxq_f32(x, y)}
    }
    #[inline]
    pub fn f32x4_min(x: f32x4, y: f32x4) -> f32x4 {
        unsafe {super::arm_vminq_f32(x, y)}
    }

    macro_rules! bools {
        ($($ty: ty, $half: ty, $all: ident ($min: ident), $any: ident ($max: ident);)*) => {
            $(
                #[inline]
                pub fn $all(x: $ty) -> bool {
                    unsafe {
                        let (lo, hi): ($half, $half) = mem::transmute(x);
                        let x = super::$min(lo, hi);
                        // Only lane 0 of the final pairwise step is read, so the
                        // contents of the second operand never matter.
                        let y = super::$min(x, mem::uninitialized());
                        y.0 != 0
                    }
                }
                #[inline]
                pub fn $any(x: $ty) -> bool {
                    unsafe {
                        let (lo, hi): ($half, $half) = mem::transmute(x);
                        let x = super::$max(lo, hi);
                        let y = super::$max(x, mem::uninitialized());
                        y.0 != 0
                    }
                }
            )*
        }
    }

    bools! {
        bool32fx4, arm::neon::u32x2, bool32fx4_all(arm_vpmin_u32), bool32fx4_any(arm_vpmax_u32);
        bool8ix16, arm::neon::u8x8, bool8ix16_all(arm_vpmin_u8), bool8ix16_any(arm_vpmax_u8);
        bool16ix8, arm::neon::u16x4, bool16ix8_all(arm_vpmin_u16), bool16ix8_any(arm_vpmax_u16);
        bool32ix4, arm::neon::u32x2, bool32ix4_all(arm_vpmin_u32), bool32ix4_any(arm_vpmax_u32);
    }
}
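Without AArch64's horizontal reductions, `all`/`any` here fall back on the pairwise `VPMIN`/`VPMAX` instructions: one pairwise step folds the vector's two halves, a second folds the survivors, and only lane 0 of the result is read. A scalar model of the min-based `all` reduction:

```rust
// Scalar model of arm_vpmin_u32: lane-pairwise minimum of two 2-lane vectors.
fn vpmin(a: [u32; 2], b: [u32; 2]) -> [u32; 2] {
    [a[0].min(a[1]), b[0].min(b[1])]
}

// `all` over a 4-lane mask, mirroring the two-step reduction above.
fn all4(mask: [u32; 4]) -> bool {
    let (lo, hi) = ([mask[0], mask[1]], [mask[2], mask[3]]);
    let step = vpmin(lo, hi);       // [min(m0, m1), min(m2, m3)]
    let red = vpmin(step, [0, 0]);  // lane 0 = min of all four lanes
    red[0] != 0
}

fn main() {
    assert!(all4([!0; 4]));
    assert!(!all4([!0, !0, 0, !0]));
}
```
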
@@ -0,0 +1,521 @@
use super::*;
#[allow(unused_imports)]
use super::{
    simd_eq, simd_ne, simd_lt, simd_le, simd_gt, simd_ge,
    simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
    simd_insert, simd_extract,
    simd_cast,
    simd_add, simd_sub, simd_mul, simd_div, simd_shl, simd_shr, simd_and, simd_or, simd_xor,

    Unalign, bitcast,
};
use std::mem;
use std::ops;

#[cfg(any(target_arch = "x86",
          target_arch = "x86_64"))]
use x86::sse2::common;
#[cfg(any(target_arch = "arm"))]
use arm::neon::common;
#[cfg(any(target_arch = "aarch64"))]
use aarch64::neon::common;

macro_rules! basic_impls {
    ($(
        $name: ident:
        $elem: ident, $bool: ident, $shuffle: ident, $length: expr, $($first: ident),* | $($last: ident),*;
    )*) => {
        $(impl $name {
            /// Create a new instance.
            #[inline]
            pub const fn new($($first: $elem),*, $($last: $elem),*) -> $name {
                $name($($first),*, $($last),*)
            }

            /// Create a new instance where every lane has value `x`.
            #[inline]
            pub const fn splat(x: $elem) -> $name {
                $name($({ #[allow(dead_code)] struct $first; x }),*,
                      $({ #[allow(dead_code)] struct $last; x }),*)
            }

            /// Compare for equality.
            #[inline]
            pub fn eq(self, other: Self) -> $bool {
                unsafe {simd_eq(self, other)}
            }
            /// Compare for inequality.
            #[inline]
            pub fn ne(self, other: Self) -> $bool {
                unsafe {simd_ne(self, other)}
            }
            /// Compare for less-than.
            #[inline]
            pub fn lt(self, other: Self) -> $bool {
                unsafe {simd_lt(self, other)}
            }
            /// Compare for less-than-or-equal.
            #[inline]
            pub fn le(self, other: Self) -> $bool {
                unsafe {simd_le(self, other)}
            }
            /// Compare for greater-than.
            #[inline]
            pub fn gt(self, other: Self) -> $bool {
                unsafe {simd_gt(self, other)}
            }
            /// Compare for greater-than-or-equal.
            #[inline]
            pub fn ge(self, other: Self) -> $bool {
                unsafe {simd_ge(self, other)}
            }

            /// Extract the value of the `idx`th lane of `self`.
            ///
            /// # Panics
            ///
            /// `extract` will panic if `idx` is out of bounds.
            #[inline]
            pub fn extract(self, idx: u32) -> $elem {
                assert!(idx < $length);
                unsafe {simd_extract(self, idx)}
            }
            /// Return a new vector where the `idx`th lane is replaced
            /// by `elem`.
            ///
            /// # Panics
            ///
            /// `replace` will panic if `idx` is out of bounds.
            #[inline]
            pub fn replace(self, idx: u32, elem: $elem) -> Self {
                assert!(idx < $length);
                unsafe {simd_insert(self, idx, elem)}
            }

            /// Load a new value from the `idx`th position of `array`.
            ///
            /// This is equivalent to the following, but is possibly
            /// more efficient:
            ///
            /// ```rust,ignore
            /// Self::new(array[idx], array[idx + 1], ...)
            /// ```
            ///
            /// # Panics
            ///
            /// `load` will panic if `idx` is out of bounds in
            /// `array`, or if `array[idx..]` is too short.
            #[inline]
            pub fn load(array: &[$elem], idx: usize) -> Self {
                let data = &array[idx..idx + $length];
                let loaded = unsafe {
                    *(data.as_ptr() as *const Unalign<Self>)
                };
                loaded.0
            }

            /// Store the elements of `self` to `array`, starting at
            /// the `idx`th position.
            ///
            /// This is equivalent to the following, but is possibly
            /// more efficient:
            ///
            /// ```rust,ignore
            /// array[i] = self.extract(0);
            /// array[i + 1] = self.extract(1);
            /// // ...
            /// ```
            ///
            /// # Panics
            ///
            /// `store` will panic if `idx` is out of bounds in
            /// `array`, or if `array[idx..]` is too short.
            #[inline]
            pub fn store(self, array: &mut [$elem], idx: usize) {
                let place = &mut array[idx..idx + $length];
                unsafe {
                    *(place.as_mut_ptr() as *mut Unalign<Self>) = Unalign(self)
                }
            }
        })*
    }
}

basic_impls! {
    u32x4: u32, bool32ix4, simd_shuffle4, 4, x0, x1 | x2, x3;
    i32x4: i32, bool32ix4, simd_shuffle4, 4, x0, x1 | x2, x3;
    f32x4: f32, bool32fx4, simd_shuffle4, 4, x0, x1 | x2, x3;

    u16x8: u16, bool16ix8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;
    i16x8: i16, bool16ix8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;

    u8x16: u8, bool8ix16, simd_shuffle16, 16, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15;
    i8x16: i8, bool8ix16, simd_shuffle16, 16, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15;
}

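A usage sketch of the interface generated above (constructors, lane access, and the unaligned `load`/`store` through slices), assuming the crate is available as `simd` on a suitable nightly:

```rust
extern crate simd;
use simd::f32x4;

fn main() {
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let b = f32x4::splat(2.0);
    assert_eq!((a * b).extract(3), 8.0);   // lane-wise multiply

    // `load`/`store` perform unaligned accesses through a slice window.
    let xs = [0.5_f32, 1.5, 2.5, 3.5, 4.5];
    let v = f32x4::load(&xs, 1);           // reads xs[1..5]
    let mut out = [0.0_f32; 4];
    v.store(&mut out, 0);
    assert_eq!(out, [1.5, 2.5, 3.5, 4.5]);
}
```
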
macro_rules! bool_impls {
    ($(
        $name: ident:
        $elem: ident, $repr: ident, $repr_elem: ident, $length: expr, $all: ident, $any: ident,
        $($first: ident),* | $($last: ident),*
        [$(#[$cvt_meta: meta] $cvt: ident -> $cvt_to: ident),*];
    )*) => {
        $(impl $name {
            /// Convert to integer representation.
            #[inline]
            pub fn to_repr(self) -> $repr {
                unsafe {mem::transmute(self)}
            }
            /// Convert from integer representation.
            #[inline]
            pub fn from_repr(x: $repr) -> Self {
                unsafe {mem::transmute(x)}
            }

            /// Create a new instance.
            #[inline]
            pub fn new($($first: bool),*, $($last: bool),*) -> $name {
                unsafe {
                    // negate everything together
                    simd_sub($name::splat(false),
                             $name($( ($first as $repr_elem) ),*,
                                   $( ($last as $repr_elem) ),*))
                }
            }

            /// Create a new instance where every lane has value `x`.
            #[allow(unused_variables)]
            #[inline]
            pub fn splat(x: bool) -> $name {
                let x = if x {!(0 as $repr_elem)} else {0};
                $name($({ let $first = (); x}),*,
                      $({ let $last = (); x}),*)
            }

            /// Extract the value of the `idx`th lane of `self`.
            ///
            /// # Panics
            ///
            /// `extract` will panic if `idx` is out of bounds.
            #[inline]
            pub fn extract(self, idx: u32) -> bool {
                assert!(idx < $length);
                unsafe {simd_extract(self.to_repr(), idx) != 0}
            }
            /// Return a new vector where the `idx`th lane is replaced
            /// by `elem`.
            ///
            /// # Panics
            ///
            /// `replace` will panic if `idx` is out of bounds.
            #[inline]
            pub fn replace(self, idx: u32, elem: bool) -> Self {
                assert!(idx < $length);
                let x = if elem {!(0 as $repr_elem)} else {0};
                unsafe {Self::from_repr(simd_insert(self.to_repr(), idx, x))}
            }
            /// Select between elements of `then` and `else_`, based on
            /// the corresponding element of `self`.
            ///
            /// This is equivalent to the following, but is possibly
            /// more efficient:
            ///
            /// ```rust,ignore
            /// T::new(if self.extract(0) { then.extract(0) } else { else_.extract(0) },
            ///        if self.extract(1) { then.extract(1) } else { else_.extract(1) },
            ///        ...)
            /// ```
            #[inline]
            pub fn select<T: Simd<Bool = $name>>(self, then: T, else_: T) -> T {
                let then: $repr = bitcast(then);
                let else_: $repr = bitcast(else_);
                bitcast((then & self.to_repr()) | (else_ & (!self).to_repr()))
            }

            /// Check if every element of `self` is true.
            ///
            /// This is equivalent to the following, but is possibly
            /// more efficient:
            ///
            /// ```rust,ignore
            /// self.extract(0) && self.extract(1) && ...
            /// ```
            #[inline]
            pub fn all(self) -> bool {
                common::$all(self)
            }
            /// Check if any element of `self` is true.
            ///
            /// This is equivalent to the following, but is possibly
            /// more efficient:
            ///
            /// ```rust,ignore
            /// self.extract(0) || self.extract(1) || ...
            /// ```
            #[inline]
            pub fn any(self) -> bool {
                common::$any(self)
            }

            $(
                #[$cvt_meta]
                #[inline]
                pub fn $cvt(self) -> $cvt_to {
                    bitcast(self)
                }
            )*
        }
        impl ops::Not for $name {
            type Output = Self;

            #[inline]
            fn not(self) -> Self {
                Self::from_repr($repr::splat(!(0 as $repr_elem)) ^ self.to_repr())
            }
        }
        )*
    }
}

bool_impls! {
    bool32ix4: bool32i, i32x4, i32, 4, bool32ix4_all, bool32ix4_any, x0, x1 | x2, x3
    [/// Convert `self` to a boolean vector for interacting with floating point vectors.
     to_f -> bool32fx4];
    bool32fx4: bool32f, i32x4, i32, 4, bool32fx4_all, bool32fx4_any, x0, x1 | x2, x3
    [/// Convert `self` to a boolean vector for interacting with integer vectors.
     to_i -> bool32ix4];

    bool16ix8: bool16i, i16x8, i16, 8, bool16ix8_all, bool16ix8_any, x0, x1, x2, x3 | x4, x5, x6, x7 [];

    bool8ix16: bool8i, i8x16, i8, 16, bool8ix16_all, bool8ix16_any, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15 [];
}

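A sketch of the boolean-vector API generated above: comparisons produce a mask, `any`/`all` reduce it, and `select` blends two vectors through the `(then & mask) | (else & !mask)` identity shown in its body:

```rust
extern crate simd;
use simd::f32x4;

// Clamp every lane of `v` to at most `limit`, branching only once.
fn clamp_to(v: f32x4, limit: f32) -> f32x4 {
    let too_big = v.gt(f32x4::splat(limit));   // bool32fx4 mask
    if too_big.any() {
        too_big.select(f32x4::splat(limit), v)
    } else {
        v
    }
}

fn main() {
    let r = clamp_to(f32x4::new(0.5, 9.0, 2.0, 7.5), 3.0);
    assert_eq!((r.extract(1), r.extract(3)), (3.0, 3.0));
    assert_eq!(r.extract(0), 0.5);
}
```
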
impl u32x4 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i32(self) -> i32x4 {
        unsafe {simd_cast(self)}
    }
    /// Convert each lane to a 32-bit float.
    #[inline]
    pub fn to_f32(self) -> f32x4 {
        unsafe {simd_cast(self)}
    }
}
impl i32x4 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u32(self) -> u32x4 {
        unsafe {simd_cast(self)}
    }
    /// Convert each lane to a 32-bit float.
    #[inline]
    pub fn to_f32(self) -> f32x4 {
        unsafe {simd_cast(self)}
    }
}
impl f32x4 {
    /// Compute the square root of each lane.
    #[inline]
    pub fn sqrt(self) -> Self {
        common::f32x4_sqrt(self)
    }
    /// Compute an approximation to the reciprocal of the square root
    /// of `self`, that is, `f32::splat(1.0) / self.sqrt()`.
    ///
    /// The accuracy of this approximation is platform dependent.
    #[inline]
    pub fn approx_rsqrt(self) -> Self {
        common::f32x4_approx_rsqrt(self)
    }
    /// Compute an approximation to the reciprocal of `self`, that is,
    /// `f32::splat(1.0) / self`.
    ///
    /// The accuracy of this approximation is platform dependent.
    #[inline]
    pub fn approx_reciprocal(self) -> Self {
        common::f32x4_approx_reciprocal(self)
    }
    /// Compute the lane-wise maximum of `self` and `other`.
    ///
    /// This is equivalent to the following, but is possibly more
    /// efficient:
    ///
    /// ```rust,ignore
    /// f32x4::new(self.extract(0).max(other.extract(0)),
    ///            self.extract(1).max(other.extract(1)),
    ///            ...)
    /// ```
    #[inline]
    pub fn max(self, other: Self) -> Self {
        common::f32x4_max(self, other)
    }
    /// Compute the lane-wise minimum of `self` and `other`.
    ///
    /// This is equivalent to the following, but is possibly more
    /// efficient:
    ///
    /// ```rust,ignore
    /// f32x4::new(self.extract(0).min(other.extract(0)),
    ///            self.extract(1).min(other.extract(1)),
    ///            ...)
    /// ```
    #[inline]
    pub fn min(self, other: Self) -> Self {
        common::f32x4_min(self, other)
    }
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i32(self) -> i32x4 {
        unsafe {simd_cast(self)}
    }
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u32(self) -> u32x4 {
        unsafe {simd_cast(self)}
    }
}

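`approx_rsqrt` trades accuracy for speed; a standard way to recover precision is one Newton-Raphson step, r' = r * (1.5 - 0.5 * x * r * r). The refinement below is not part of the crate shown here, just a common companion to the hardware estimate:

```rust
extern crate simd;
use simd::f32x4;

// One Newton-Raphson refinement on top of the hardware estimate:
// r' = r * (1.5 - 0.5 * x * r * r)
fn rsqrt_refined(x: f32x4) -> f32x4 {
    let r = x.approx_rsqrt();
    r * (f32x4::splat(1.5) - f32x4::splat(0.5) * x * r * r)
}

fn main() {
    let y = rsqrt_refined(f32x4::splat(4.0));
    assert!((y.extract(0) - 0.5).abs() < 1e-4);
}
```
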
impl i16x8 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u16(self) -> u16x8 {
        unsafe {simd_cast(self)}
    }
}
impl u16x8 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i16(self) -> i16x8 {
        unsafe {simd_cast(self)}
    }
}

impl i8x16 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u8(self) -> u8x16 {
        unsafe {simd_cast(self)}
    }
}
impl u8x16 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i8(self) -> i8x16 {
        unsafe {simd_cast(self)}
    }
}

macro_rules! neg_impls {
    ($zero: expr, $($ty: ident,)*) => {
        $(impl ops::Neg for $ty {
            type Output = Self;
            fn neg(self) -> Self {
                $ty::splat($zero) - self
            }
        })*
    }
}
neg_impls!{
    0,
    i32x4,
    i16x8,
    i8x16,
}
neg_impls! {
    0.0,
    f32x4,
}
macro_rules! not_impls {
    ($($ty: ident,)*) => {
        $(impl ops::Not for $ty {
            type Output = Self;
            fn not(self) -> Self {
                $ty::splat(!0) ^ self
            }
        })*
    }
}
not_impls! {
    i32x4,
    i16x8,
    i8x16,
    u32x4,
    u16x8,
    u8x16,
}

macro_rules! operators {
    ($($trayt: ident ($func: ident, $method: ident): $($ty: ty),*;)*) => {
        $(
            $(impl ops::$trayt for $ty {
                type Output = Self;
                #[inline]
                fn $method(self, x: Self) -> Self {
                    unsafe {$func(self, x)}
                }
            })*
        )*
    }
}
operators! {
    Add (simd_add, add):
        i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
        f32x4;
    Sub (simd_sub, sub):
        i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
        f32x4;
    Mul (simd_mul, mul):
        i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
        f32x4;
    Div (simd_div, div): f32x4;

    BitAnd (simd_and, bitand):
        i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
        bool8ix16, bool16ix8, bool32ix4,
        bool32fx4;
    BitOr (simd_or, bitor):
        i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
        bool8ix16, bool16ix8, bool32ix4,
        bool32fx4;
    BitXor (simd_xor, bitxor):
        i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
        bool8ix16, bool16ix8, bool32ix4,
        bool32fx4;
}

macro_rules! shift_one {
    ($ty: ident, $($by: ident),*) => {
        $(
            impl ops::Shl<$by> for $ty {
                type Output = Self;
                #[inline]
                fn shl(self, other: $by) -> Self {
                    unsafe { simd_shl(self, $ty::splat(other as <$ty as Simd>::Elem)) }
                }
            }
            impl ops::Shr<$by> for $ty {
                type Output = Self;
                #[inline]
                fn shr(self, other: $by) -> Self {
                    unsafe {simd_shr(self, $ty::splat(other as <$ty as Simd>::Elem))}
                }
            }
        )*
    }
}

macro_rules! shift {
    ($($ty: ident),*) => {
        $(shift_one! {
            $ty,
            u8, u16, u32, u64, usize,
            i8, i16, i32, i64, isize
        })*
    }
}
shift! {
    i8x16, u8x16, i16x8, u16x8, i32x4, u32x4
}
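The `shift!` expansion gives every integer vector `Shl`/`Shr` by any primitive integer scalar, splatting the count across all lanes. A sketch:

```rust
extern crate simd;
use simd::u32x4;

fn main() {
    let v = u32x4::splat(0b1010);
    let left = v << 4_u32;   // every lane shifted by the same count
    let right = v >> 1_u8;   // any primitive integer type works as the count
    assert_eq!(left.extract(0), 0b1010_0000);
    assert_eq!(right.extract(0), 0b101);
}
```
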
@@ -0,0 +1,225 @@
//! `simd` offers a basic interface to the SIMD functionality of CPUs.

#![feature(cfg_target_feature, repr_simd, platform_intrinsics, const_fn)]
#![allow(non_camel_case_types)]

#[cfg(feature = "with-serde")]
extern crate serde;
#[cfg(feature = "with-serde")]
#[macro_use]
extern crate serde_derive;

/// Boolean type for 8-bit integers.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct bool8i(i8);
/// Boolean type for 16-bit integers.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct bool16i(i16);
/// Boolean type for 32-bit integers.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct bool32i(i32);
/// Boolean type for 32-bit floats.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct bool32f(i32);

macro_rules! bool {
    ($($name: ident, $inner: ty;)*) => {
        $(
            impl From<bool> for $name {
                #[inline]
                fn from(b: bool) -> $name {
                    $name(-(b as $inner))
                }
            }
            impl From<$name> for bool {
                #[inline]
                fn from(b: $name) -> bool {
                    b.0 != 0
                }
            }
        )*
    }
}
bool! {
    bool8i, i8;
    bool16i, i16;
    bool32i, i32;
    bool32f, i32;
}

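The scalar boolean wrappers use the same all-ones convention as the vector lanes: `From<bool>` stores `-(b as iN)`, i.e. `!0` for true, and the reverse conversion just tests for non-zero. A sketch:

```rust
extern crate simd;
use simd::bool32i;

fn main() {
    let t = bool32i::from(true);   // internally -1, i.e. all bits set
    let f = bool32i::from(false);  // internally 0
    assert!(bool::from(t));
    assert!(!bool::from(f));
}
```
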
/// Types that are SIMD vectors.
pub unsafe trait Simd {
    /// The corresponding boolean vector type.
    type Bool: Simd;
    /// The element that this vector stores.
    type Elem;
}

/// A SIMD vector of 4 `u32`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u32x4(u32, u32, u32, u32);
/// A SIMD vector of 4 `i32`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i32x4(i32, i32, i32, i32);
/// A SIMD vector of 4 `f32`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct f32x4(f32, f32, f32, f32);
/// A SIMD boolean vector for length-4 vectors of 32-bit integers.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool32ix4(i32, i32, i32, i32);
/// A SIMD boolean vector for length-4 vectors of 32-bit floats.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool32fx4(i32, i32, i32, i32);

#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct u32x2(u32, u32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct i32x2(i32, i32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct f32x2(f32, f32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct bool32ix2(i32, i32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct bool32fx2(i32, i32);

/// A SIMD vector of 8 `u16`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u16x8(u16, u16, u16, u16,
                 u16, u16, u16, u16);
/// A SIMD vector of 8 `i16`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i16x8(i16, i16, i16, i16,
                 i16, i16, i16, i16);
/// A SIMD boolean vector for length-8 vectors of 16-bit integers.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool16ix8(i16, i16, i16, i16,
                     i16, i16, i16, i16);

/// A SIMD vector of 16 `u8`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u8x16(u8, u8, u8, u8, u8, u8, u8, u8,
                 u8, u8, u8, u8, u8, u8, u8, u8);
/// A SIMD vector of 16 `i8`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i8x16(i8, i8, i8, i8, i8, i8, i8, i8,
                 i8, i8, i8, i8, i8, i8, i8, i8);
/// A SIMD boolean vector for length-16 vectors of 8-bit integers.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool8ix16(i8, i8, i8, i8, i8, i8, i8, i8,
                     i8, i8, i8, i8, i8, i8, i8, i8);


macro_rules! simd {
    ($($bool: ty: $($ty: ty = $elem: ty),*;)*) => {
        $($(unsafe impl Simd for $ty {
            type Bool = $bool;
            type Elem = $elem;
        }
        impl Clone for $ty { #[inline] fn clone(&self) -> Self { *self } }
        )*)*}
}
simd! {
    bool8ix16: i8x16 = i8, u8x16 = u8, bool8ix16 = bool8i;
    bool16ix8: i16x8 = i16, u16x8 = u16, bool16ix8 = bool16i;
    bool32ix4: i32x4 = i32, u32x4 = u32, bool32ix4 = bool32i;
    bool32fx4: f32x4 = f32, bool32fx4 = bool32f;

    bool32ix2: i32x2 = i32, u32x2 = u32, bool32ix2 = bool32i;
    bool32fx2: f32x2 = f32, bool32fx2 = bool32f;
}

#[allow(dead_code)]
#[inline]
fn bitcast<T: Simd, U: Simd>(x: T) -> U {
    assert_eq!(std::mem::size_of::<T>(),
               std::mem::size_of::<U>());
    unsafe {std::mem::transmute_copy(&x)}
}

#[allow(dead_code)]
extern "platform-intrinsic" {
    fn simd_eq<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
    fn simd_ne<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
    fn simd_lt<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
    fn simd_le<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
    fn simd_gt<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
    fn simd_ge<T: Simd<Bool = U>, U>(x: T, y: T) -> U;

    fn simd_shuffle2<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 2]) -> U;
    fn simd_shuffle4<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 4]) -> U;
    fn simd_shuffle8<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 8]) -> U;
    fn simd_shuffle16<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 16]) -> U;

    fn simd_insert<T: Simd<Elem = U>, U>(x: T, idx: u32, val: U) -> T;
    fn simd_extract<T: Simd<Elem = U>, U>(x: T, idx: u32) -> U;

    fn simd_cast<T: Simd, U: Simd>(x: T) -> U;

    fn simd_add<T: Simd>(x: T, y: T) -> T;
    fn simd_sub<T: Simd>(x: T, y: T) -> T;
    fn simd_mul<T: Simd>(x: T, y: T) -> T;
    fn simd_div<T: Simd>(x: T, y: T) -> T;
    fn simd_shl<T: Simd>(x: T, y: T) -> T;
    fn simd_shr<T: Simd>(x: T, y: T) -> T;
    fn simd_and<T: Simd>(x: T, y: T) -> T;
    fn simd_or<T: Simd>(x: T, y: T) -> T;
    fn simd_xor<T: Simd>(x: T, y: T) -> T;
}
#[repr(packed)]
#[derive(Debug, Copy, Clone)]
struct Unalign<T>(T);

#[macro_use]
mod common;
mod sixty_four;
mod v256;

#[cfg(any(feature = "doc",
          target_arch = "x86",
          target_arch = "x86_64"))]
pub mod x86;
#[cfg(any(feature = "doc", target_arch = "arm"))]
pub mod arm;
#[cfg(any(feature = "doc", target_arch = "aarch64"))]
pub mod aarch64;
@@ -0,0 +1,229 @@
#![allow(dead_code)]
use super::*;
#[allow(unused_imports)]
use super::{
    f32x2,
    simd_eq, simd_ne, simd_lt, simd_le, simd_gt, simd_ge,
    simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
    simd_insert, simd_extract,
    simd_cast,
    simd_add, simd_sub, simd_mul, simd_div, simd_shl, simd_shr, simd_and, simd_or, simd_xor,

    Unalign, bitcast,
};
use std::mem;
use std::ops;

/// Boolean type for 64-bit integers.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone)]
pub struct bool64i(i64);
/// Boolean type for 64-bit floats.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone)]
pub struct bool64f(i64);
/// A SIMD vector of 2 `u64`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u64x2(u64, u64);
/// A SIMD vector of 2 `i64`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i64x2(i64, i64);
/// A SIMD vector of 2 `f64`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct f64x2(f64, f64);
/// A SIMD boolean vector for length-2 vectors of 64-bit integers.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool64ix2(i64, i64);
/// A SIMD boolean vector for length-2 vectors of 64-bit floats.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool64fx2(i64, i64);

simd! {
    bool64ix2: i64x2 = i64, u64x2 = u64, bool64ix2 = bool64i;
    bool64fx2: f64x2 = f64, bool64fx2 = bool64f;
}
basic_impls! {
    u64x2: u64, bool64ix2, simd_shuffle2, 2, x0 | x1;
    i64x2: i64, bool64ix2, simd_shuffle2, 2, x0 | x1;
    f64x2: f64, bool64fx2, simd_shuffle2, 2, x0 | x1;
}

mod common {
    use super::*;
    // naive for now
    #[inline]
    pub fn bool64ix2_all(x: bool64ix2) -> bool {
        x.0 != 0 && x.1 != 0
    }
    #[inline]
    pub fn bool64ix2_any(x: bool64ix2) -> bool {
        x.0 != 0 || x.1 != 0
    }
    #[inline]
    pub fn bool64fx2_all(x: bool64fx2) -> bool {
        x.0 != 0 && x.1 != 0
    }
    #[inline]
    pub fn bool64fx2_any(x: bool64fx2) -> bool {
        x.0 != 0 || x.1 != 0
    }
}
bool_impls! {
    bool64ix2: bool64i, i64x2, i64, 2, bool64ix2_all, bool64ix2_any, x0 | x1
    [/// Convert `self` to a boolean vector for interacting with floating point vectors.
     to_f -> bool64fx2];

    bool64fx2: bool64f, i64x2, i64, 2, bool64fx2_all, bool64fx2_any, x0 | x1
    [/// Convert `self` to a boolean vector for interacting with integer vectors.
     to_i -> bool64ix2];
}

impl u64x2 {
|
||||
/// Convert each lane to a signed integer.
|
||||
#[inline]
|
||||
pub fn to_i64(self) -> i64x2 {
|
||||
unsafe {simd_cast(self)}
|
||||
}
|
||||
/// Convert each lane to a 64-bit float.
|
||||
#[inline]
|
||||
pub fn to_f64(self) -> f64x2 {
|
||||
unsafe {simd_cast(self)}
|
||||
}
|
||||
}
|
||||
impl i64x2 {
|
||||
/// Convert each lane to an unsigned integer.
|
||||
#[inline]
|
||||
pub fn to_u64(self) -> u64x2 {
|
||||
unsafe {simd_cast(self)}
|
||||
}
|
||||
/// Convert each lane to a 64-bit float.
|
||||
#[inline]
|
||||
pub fn to_f64(self) -> f64x2 {
|
||||
unsafe {simd_cast(self)}
|
||||
}
|
||||
}
|
||||
impl f64x2 {
|
||||
/// Convert each lane to a signed integer.
|
||||
#[inline]
|
||||
pub fn to_i64(self) -> i64x2 {
|
||||
unsafe {simd_cast(self)}
|
||||
}
|
||||
/// Convert each lane to an unsigned integer.
|
||||
#[inline]
|
||||
pub fn to_u64(self) -> u64x2 {
|
||||
unsafe {simd_cast(self)}
|
||||
}
|
||||
|
||||
/// Convert each lane to a 32-bit float.
|
||||
#[inline]
|
||||
pub fn to_f32(self) -> f32x4 {
|
||||
unsafe {
|
||||
let x: f32x2 = simd_cast(self);
|
||||
f32x4::new(x.0, x.1, 0.0, 0.0)
|
||||
}
|
||||
}
|
||||
}
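// Illustrative sketch (exposition only): `to_f32` narrows the two f64 lanes
// into the low half of an `f32x4` and zero-fills the upper lanes, since the
// crate exposes no standalone two-lane f32 vector in its public API:
//
//     let wide = f64x2::new(1.5, -2.5);
//     let narrow = wide.to_f32();   // (1.5, -2.5, 0.0, 0.0)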
neg_impls! {
    0,
    i64x2,
}
neg_impls! {
    0.0,
    f64x2,
}
macro_rules! not_impls {
    ($($ty: ident,)*) => {
        $(impl ops::Not for $ty {
            type Output = Self;
            fn not(self) -> Self {
                $ty::splat(!0) ^ self
            }
        })*
    }
}
not_impls! {
    i64x2,
    u64x2,
}
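// Illustrative sketch (exposition only): there is no dedicated vector "not"
// operation here, so `!v` is spelled as XOR against an all-ones splat, which
// flips every bit of every lane:
//
//     let v = u64x2::splat(0x00ff00ff00ff00ff);
//     assert!((!v).eq(u64x2::splat(0xff00ff00ff00ff00)).all());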
macro_rules! operators {
    ($($trayt: ident ($func: ident, $method: ident): $($ty: ty),*;)*) => {
        $(
            $(impl ops::$trayt for $ty {
                type Output = Self;
                #[inline]
                fn $method(self, x: Self) -> Self {
                    unsafe { $func(self, x) }
                }
            })*
        )*
    }
}
operators! {
    Add (simd_add, add):
        i64x2, u64x2,
        f64x2;
    Sub (simd_sub, sub):
        i64x2, u64x2,
        f64x2;
    Mul (simd_mul, mul):
        i64x2, u64x2,
        f64x2;
    Div (simd_div, div): f64x2;

    BitAnd (simd_and, bitand):
        i64x2, u64x2,
        bool64ix2,
        bool64fx2;
    BitOr (simd_or, bitor):
        i64x2, u64x2,
        bool64ix2,
        bool64fx2;
    BitXor (simd_xor, bitxor):
        i64x2, u64x2,
        bool64ix2,
        bool64fx2;
}
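// Illustrative sketch (exposition only): the `operators!` expansion above is
// what makes the ordinary arithmetic and bitwise operators lane-wise on
// these vector types:
//
//     let a = f64x2::new(1.0, 2.0);
//     let b = f64x2::splat(10.0);
//     let c = a * b + a;            // (11.0, 22.0), computed per lane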
macro_rules! shift_one {
    ($ty: ident, $($by: ident),*) => {
        $(
            impl ops::Shl<$by> for $ty {
                type Output = Self;
                #[inline]
                fn shl(self, other: $by) -> Self {
                    unsafe { simd_shl(self, $ty::splat(other as <$ty as Simd>::Elem)) }
                }
            }
            impl ops::Shr<$by> for $ty {
                type Output = Self;
                #[inline]
                fn shr(self, other: $by) -> Self {
                    unsafe { simd_shr(self, $ty::splat(other as <$ty as Simd>::Elem)) }
                }
            }
        )*
    }
}

macro_rules! shift {
    ($($ty: ident),*) => {
        $(shift_one! {
            $ty,
            u8, u16, u32, u64, usize,
            i8, i16, i32, i64, isize
        })*
    }
}
shift! {
    i64x2, u64x2
}
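// Illustrative sketch (exposition only): `shift!` implements `<<` and `>>`
// for every primitive integer shift-amount type, splatting the amount across
// the lanes:
//
//     let v = u64x2::new(1, 2);
//     let s = v << 4u32;            // (16, 32); the same shift per lane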
@@ -0,0 +1,424 @@
#![allow(dead_code)]
use std::ops;
use std::mem;
#[allow(unused_imports)]
use super::{
    Simd,
    u32x4, i32x4, u16x8, i16x8, u8x16, i8x16, f32x4,
    bool32ix4, bool16ix8, bool8ix16, bool32fx4,
    simd_eq, simd_ne, simd_lt, simd_le, simd_gt, simd_ge,
    simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
    simd_insert, simd_extract,
    simd_cast,
    simd_add, simd_sub, simd_mul, simd_div, simd_shl, simd_shr, simd_and, simd_or, simd_xor,
    bool8i, bool16i, bool32i, bool32f,
    Unalign, bitcast,
};
use super::sixty_four::*;
#[cfg(all(target_feature = "avx"))]
use super::x86::avx::common;

#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u64x4(u64, u64, u64, u64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i64x4(i64, i64, i64, i64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct f64x4(f64, f64, f64, f64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool64ix4(i64, i64, i64, i64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool64fx4(i64, i64, i64, i64);

#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u32x8(u32, u32, u32, u32,
                 u32, u32, u32, u32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i32x8(i32, i32, i32, i32,
                 i32, i32, i32, i32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct f32x8(f32, f32, f32, f32,
                 f32, f32, f32, f32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool32ix8(i32, i32, i32, i32,
                     i32, i32, i32, i32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool32fx8(i32, i32, i32, i32,
                     i32, i32, i32, i32);

#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u16x16(u16, u16, u16, u16, u16, u16, u16, u16,
                  u16, u16, u16, u16, u16, u16, u16, u16);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i16x16(i16, i16, i16, i16, i16, i16, i16, i16,
                  i16, i16, i16, i16, i16, i16, i16, i16);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool16ix16(i16, i16, i16, i16, i16, i16, i16, i16,
                      i16, i16, i16, i16, i16, i16, i16, i16);

#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u8x32(u8, u8, u8, u8, u8, u8, u8, u8,
                 u8, u8, u8, u8, u8, u8, u8, u8,
                 u8, u8, u8, u8, u8, u8, u8, u8,
                 u8, u8, u8, u8, u8, u8, u8, u8);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i8x32(i8, i8, i8, i8, i8, i8, i8, i8,
                 i8, i8, i8, i8, i8, i8, i8, i8,
                 i8, i8, i8, i8, i8, i8, i8, i8,
                 i8, i8, i8, i8, i8, i8, i8, i8);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool8ix32(i8, i8, i8, i8, i8, i8, i8, i8,
                     i8, i8, i8, i8, i8, i8, i8, i8,
                     i8, i8, i8, i8, i8, i8, i8, i8,
                     i8, i8, i8, i8, i8, i8, i8, i8);

simd! {
    bool8ix32: i8x32 = i8, u8x32 = u8, bool8ix32 = bool8i;
    bool16ix16: i16x16 = i16, u16x16 = u16, bool16ix16 = bool16i;
    bool32ix8: i32x8 = i32, u32x8 = u32, bool32ix8 = bool32i;
    bool64ix4: i64x4 = i64, u64x4 = u64, bool64ix4 = bool64i;

    bool32fx8: f32x8 = f32, bool32fx8 = bool32f;
    bool64fx4: f64x4 = f64, bool64fx4 = bool64f;
}

basic_impls! {
    u64x4: u64, bool64ix4, simd_shuffle4, 4, x0, x1 | x2, x3;
    i64x4: i64, bool64ix4, simd_shuffle4, 4, x0, x1 | x2, x3;
    f64x4: f64, bool64fx4, simd_shuffle4, 4, x0, x1 | x2, x3;

    u32x8: u32, bool32ix8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;
    i32x8: i32, bool32ix8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;
    f32x8: f32, bool32fx8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;

    u16x16: u16, bool16ix16, simd_shuffle16, 16, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15;
    i16x16: i16, bool16ix16, simd_shuffle16, 16, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15;

    u8x32: u8, bool8ix32, simd_shuffle32, 32, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |
           x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31;
    i8x32: i8, bool8ix32, simd_shuffle32, 32, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |
           x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31;
}

#[cfg(all(not(target_feature = "avx")))]
#[doc(hidden)]
mod common {
    use super::*;
    // implementation via SSE vectors
    macro_rules! bools {
        ($($ty: ty, $all: ident, $any: ident;)*) => {
            $(
                #[inline]
                pub fn $all(x: $ty) -> bool {
                    x.low().all() && x.high().all()
                }
                #[inline]
                pub fn $any(x: $ty) -> bool {
                    x.low().any() || x.high().any()
                }
            )*
        }
    }

    bools! {
        bool64ix4, bool64ix4_all, bool64ix4_any;
        bool64fx4, bool64fx4_all, bool64fx4_any;
        bool32ix8, bool32ix8_all, bool32ix8_any;
        bool32fx8, bool32fx8_all, bool32fx8_any;
        bool16ix16, bool16ix16_all, bool16ix16_any;
        bool8ix32, bool8ix32_all, bool8ix32_any;
    }
}
bool_impls! {
    bool64ix4: bool64i, i64x4, i64, 4, bool64ix4_all, bool64ix4_any, x0, x1 | x2, x3
    [/// Convert `self` to a boolean vector for interacting with floating point vectors.
     to_f -> bool64fx4];

    bool64fx4: bool64f, i64x4, i64, 4, bool64fx4_all, bool64fx4_any, x0, x1 | x2, x3
    [/// Convert `self` to a boolean vector for interacting with integer vectors.
     to_i -> bool64ix4];

    bool32ix8: bool32i, i32x8, i32, 8, bool32ix8_all, bool32ix8_any, x0, x1, x2, x3 | x4, x5, x6, x7
    [/// Convert `self` to a boolean vector for interacting with floating point vectors.
     to_f -> bool32fx8];

    bool32fx8: bool32f, i32x8, i32, 8, bool32fx8_all, bool32fx8_any, x0, x1, x2, x3 | x4, x5, x6, x7
    [/// Convert `self` to a boolean vector for interacting with integer vectors.
     to_i -> bool32ix8];

    bool16ix16: bool16i, i16x16, i16, 16, bool16ix16_all, bool16ix16_any,
                x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15 [];

    bool8ix32: bool8i, i8x32, i8, 32, bool8ix32_all, bool8ix32_any,
               x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |
               x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 [];
}

pub trait LowHigh128 {
    type Half: Simd;
    /// Extract the low 128 bit part.
    fn low(self) -> Self::Half;
    /// Extract the high 128 bit part.
    fn high(self) -> Self::Half;
}

macro_rules! expr { ($x:expr) => ($x) } // HACK
macro_rules! low_high_impls {
    ($(
        $name: ident, $half: ident, $($first: tt),+ ... $($last: tt),+;
    )*) => {
        $(impl LowHigh128 for $name {
            type Half = $half;
            #[inline]
            fn low(self) -> Self::Half {
                $half::new($( expr!(self.$first), )*)
            }

            #[inline]
            fn high(self) -> Self::Half {
                $half::new($( expr!(self.$last), )*)
            }
        })*
    }
}

low_high_impls! {
    u64x4, u64x2, 0, 1 ... 2, 3;
    i64x4, i64x2, 0, 1 ... 2, 3;
    f64x4, f64x2, 0, 1 ... 2, 3;

    u32x8, u32x4, 0, 1, 2, 3 ... 4, 5, 6, 7;
    i32x8, i32x4, 0, 1, 2, 3 ... 4, 5, 6, 7;
    f32x8, f32x4, 0, 1, 2, 3 ... 4, 5, 6, 7;

    u16x16, u16x8, 0, 1, 2, 3, 4, 5, 6, 7 ... 8, 9, 10, 11, 12, 13, 14, 15;
    i16x16, i16x8, 0, 1, 2, 3, 4, 5, 6, 7 ... 8, 9, 10, 11, 12, 13, 14, 15;

    u8x32, u8x16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ...
                  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31;
    i8x32, i8x16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ...
                  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31;
}

macro_rules! bool_low_high_impls {
    ($(
        $name: ident: $half: ident;
    )*) => {
        $(impl LowHigh128 for $name {
            type Half = $half;
            /// Extract the low 128 bit part.
            #[inline]
            fn low(self) -> Self::Half {
                Self::Half::from_repr(self.to_repr().low())
            }

            /// Extract the high 128 bit part.
            #[inline]
            fn high(self) -> Self::Half {
                Self::Half::from_repr(self.to_repr().high())
            }
        })*
    }
}

bool_low_high_impls! {
    bool64fx4: bool64fx2;
    bool32fx8: bool32fx4;

    bool64ix4: bool64ix2;
    bool32ix8: bool32ix4;
    bool16ix16: bool16ix8;
    bool8ix32: bool8ix16;
}
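// Illustrative sketch (exposition only): without AVX, 256-bit values are
// handled as two 128-bit halves via `LowHigh128`, which is exactly how the
// non-AVX `common` module above reduces `all`/`any`:
//
//     let v = u64x4::new(1, 2, 3, 4);
//     let lo = v.low();    // u64x2: (1, 2)
//     let hi = v.high();   // u64x2: (3, 4)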
impl u64x4 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i64(self) -> i64x4 {
        unsafe { simd_cast(self) }
    }
    /// Convert each lane to a 64-bit float.
    #[inline]
    pub fn to_f64(self) -> f64x4 {
        unsafe { simd_cast(self) }
    }
}

impl i64x4 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u64(self) -> u64x4 {
        unsafe { simd_cast(self) }
    }
    /// Convert each lane to a 64-bit float.
    #[inline]
    pub fn to_f64(self) -> f64x4 {
        unsafe { simd_cast(self) }
    }
}

impl f64x4 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i64(self) -> i64x4 {
        unsafe { simd_cast(self) }
    }
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u64(self) -> u64x4 {
        unsafe { simd_cast(self) }
    }
}

impl u32x8 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i32(self) -> i32x8 {
        unsafe { simd_cast(self) }
    }
    /// Convert each lane to a 32-bit float.
    #[inline]
    pub fn to_f32(self) -> f32x8 {
        unsafe { simd_cast(self) }
    }
}

impl i32x8 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u32(self) -> u32x8 {
        unsafe { simd_cast(self) }
    }
    /// Convert each lane to a 32-bit float.
    #[inline]
    pub fn to_f32(self) -> f32x8 {
        unsafe { simd_cast(self) }
    }
}

impl i16x16 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u16(self) -> u16x16 {
        unsafe { simd_cast(self) }
    }
}

impl u16x16 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i16(self) -> i16x16 {
        unsafe { simd_cast(self) }
    }
}

impl i8x32 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u8(self) -> u8x32 {
        unsafe { simd_cast(self) }
    }
}

impl u8x32 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i8(self) -> i8x32 {
        unsafe { simd_cast(self) }
    }
}

operators! {
    Add (simd_add, add):
        i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
        f64x4, f32x8;
    Sub (simd_sub, sub):
        i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
        f64x4, f32x8;
    Mul (simd_mul, mul):
        i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
        f64x4, f32x8;
    Div (simd_div, div): f64x4, f32x8;

    BitAnd (simd_and, bitand):
        i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
        bool64ix4, bool32ix8, bool16ix16,
        bool64fx4, bool32fx8;
    BitOr (simd_or, bitor):
        i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
        bool64ix4, bool32ix8, bool16ix16,
        bool64fx4, bool32fx8;
    BitXor (simd_xor, bitxor):
        i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
        bool64ix4, bool32ix8, bool16ix16,
        bool64fx4, bool32fx8;
}

neg_impls! {
    0,
    i64x4,
    i32x8,
    i16x16,
    i8x32,
}

neg_impls! {
    0.0,
    f64x4,
    f32x8,
}

not_impls! {
    i64x4,
    u64x4,
    i32x8,
    u32x8,
    i16x16,
    u16x16,
    i8x32,
    u8x32,
}

shift! {
    i64x4,
    u64x4,
    i32x8,
    u32x8,
    i16x16,
    u16x16,
    i8x32,
    u8x32
}
@@ -0,0 +1,290 @@
use super::super::*;
use sixty_four::*;

use super::super::bitcast;

pub use v256::{
    f64x4, bool64fx4, u64x4, i64x4, bool64ix4,
    f32x8, bool32fx8, u32x8, i32x8, bool32ix8,
    u16x16, i16x16, bool16ix16,
    u8x32, i8x32, bool8ix32,
    LowHigh128
};

#[allow(dead_code)]
extern "platform-intrinsic" {
    fn x86_mm256_addsub_ps(x: f32x8, y: f32x8) -> f32x8;
    fn x86_mm256_addsub_pd(x: f64x4, y: f64x4) -> f64x4;
    fn x86_mm256_dp_ps(x: f32x8, y: f32x8, z: i32) -> f32x8;
    fn x86_mm256_hadd_ps(x: f32x8, y: f32x8) -> f32x8;
    fn x86_mm256_hadd_pd(x: f64x4, y: f64x4) -> f64x4;
    fn x86_mm256_hsub_ps(x: f32x8, y: f32x8) -> f32x8;
    fn x86_mm256_hsub_pd(x: f64x4, y: f64x4) -> f64x4;
    fn x86_mm256_max_ps(x: f32x8, y: f32x8) -> f32x8;
    fn x86_mm256_max_pd(x: f64x4, y: f64x4) -> f64x4;
    fn x86_mm256_min_ps(x: f32x8, y: f32x8) -> f32x8;
    fn x86_mm256_min_pd(x: f64x4, y: f64x4) -> f64x4;
    fn x86_mm256_movemask_ps(x: f32x8) -> i32;
    fn x86_mm256_movemask_pd(x: f64x4) -> i32;
    fn x86_mm_permutevar_ps(x: f32x4, y: i32x4) -> f32x4;
    fn x86_mm_permutevar_pd(x: f64x2, y: i64x2) -> f64x2;
    fn x86_mm256_permutevar_ps(x: f32x8, y: i32x8) -> f32x8;
    fn x86_mm256_permutevar_pd(x: f64x4, y: i64x4) -> f64x4;
    fn x86_mm256_rcp_ps(x: f32x8) -> f32x8;
    fn x86_mm256_rsqrt_ps(x: f32x8) -> f32x8;
    fn x86_mm256_sqrt_ps(x: f32x8) -> f32x8;
    fn x86_mm256_sqrt_pd(x: f64x4) -> f64x4;
    fn x86_mm_testc_ps(x: f32x4, y: f32x4) -> i32;
    fn x86_mm256_testc_ps(x: f32x8, y: f32x8) -> i32;
    fn x86_mm_testc_pd(x: f64x2, y: f64x2) -> i32;
    fn x86_mm256_testc_pd(x: f64x4, y: f64x4) -> i32;
    fn x86_mm256_testc_si256(x: u64x4, y: u64x4) -> i32;
    fn x86_mm_testnzc_ps(x: f32x4, y: f32x4) -> i32;
    fn x86_mm256_testnzc_ps(x: f32x8, y: f32x8) -> i32;
    fn x86_mm_testnzc_pd(x: f64x2, y: f64x2) -> i32;
    fn x86_mm256_testnzc_pd(x: f64x4, y: f64x4) -> i32;
    fn x86_mm256_testnzc_si256(x: u64x4, y: u64x4) -> i32;
    fn x86_mm_testz_ps(x: f32x4, y: f32x4) -> i32;
    fn x86_mm256_testz_ps(x: f32x8, y: f32x8) -> i32;
    fn x86_mm_testz_pd(x: f64x2, y: f64x2) -> i32;
    fn x86_mm256_testz_pd(x: f64x4, y: f64x4) -> i32;
    fn x86_mm256_testz_si256(x: u64x4, y: u64x4) -> i32;
}

#[doc(hidden)]
pub mod common {
    use super::*;
    use std::mem;

    macro_rules! bools {
        ($($ty: ty, $all: ident, $any: ident, $testc: ident, $testz: ident;)*) => {
            $(
                #[inline]
                pub fn $all(x: $ty) -> bool {
                    unsafe {
                        super::$testc(mem::transmute(x), mem::transmute(<$ty>::splat(true))) != 0
                    }
                }
                #[inline]
                pub fn $any(x: $ty) -> bool {
                    unsafe {
                        super::$testz(mem::transmute(x), mem::transmute(x)) == 0
                    }
                }
            )*
        }
    }

    bools! {
        bool32fx8, bool32fx8_all, bool32fx8_any, x86_mm256_testc_ps, x86_mm256_testz_ps;
        bool64fx4, bool64fx4_all, bool64fx4_any, x86_mm256_testc_pd, x86_mm256_testz_pd;
        bool8ix32, bool8ix32_all, bool8ix32_any, x86_mm256_testc_si256, x86_mm256_testz_si256;
        bool16ix16, bool16ix16_all, bool16ix16_any, x86_mm256_testc_si256, x86_mm256_testz_si256;
        bool32ix8, bool32ix8_all, bool32ix8_any, x86_mm256_testc_si256, x86_mm256_testz_si256;
        bool64ix4, bool64ix4_all, bool64ix4_any, x86_mm256_testc_si256, x86_mm256_testz_si256;
    }
}
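// Illustrative sketch (exposition only) of why the reductions above work:
// VTEST-style intrinsics compute bitwise predicates, so with `ones` meaning
// an all-true mask,
//
//     testc(x, ones) != 0  <=>  (!x & ones) == 0  <=>  every bit of x is set,
//     testz(x, x)    == 0  <=>  (x & x) != 0      <=>  some bit of x is set,
//
// which match `all` and `any` respectively; a scalar model would be:
//
//     fn testc(x: u64, y: u64) -> bool { !x & y == 0 }
//     fn testz(x: u64, y: u64) -> bool { x & y == 0 }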
// 128-bit vectors:

// 32 bit floats

pub trait AvxF32x4 {
    fn permutevar(self, other: i32x4) -> f32x4;
}
impl AvxF32x4 for f32x4 {
    fn permutevar(self, other: i32x4) -> f32x4 {
        unsafe { x86_mm_permutevar_ps(self, other) }
    }
}

pub trait AvxF64x4 {
    fn sqrt(self) -> Self;
    fn addsub(self, other: Self) -> Self;
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
    fn move_mask(self) -> u32;
}

impl AvxF64x4 for f64x4 {
    #[inline]
    fn sqrt(self) -> Self {
        unsafe { x86_mm256_sqrt_pd(self) }
    }

    #[inline]
    fn addsub(self, other: Self) -> Self {
        unsafe { x86_mm256_addsub_pd(self, other) }
    }

    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { x86_mm256_hadd_pd(self, other) }
    }

    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { x86_mm256_hsub_pd(self, other) }
    }

    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm256_max_pd(self, other) }
    }

    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm256_min_pd(self, other) }
    }

    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm256_movemask_pd(self) as u32 }
    }
}

pub trait AvxBool64fx4 {
    fn move_mask(self) -> u32;
}
impl AvxBool64fx4 for bool64fx4 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm256_movemask_pd(bitcast(self)) as u32 }
    }
}

pub trait AvxF32x8 {
    fn sqrt(self) -> Self;
    fn addsub(self, other: Self) -> Self;
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
    fn move_mask(self) -> u32;
    /// Compute an approximation to the reciprocal of the square root
    /// of `self`, that is, `f32x8::splat(1.0) / self.sqrt()`.
    ///
    /// The accuracy of this approximation is platform dependent.
    fn approx_rsqrt(self) -> Self;
    /// Compute an approximation to the reciprocal of `self`, that is,
    /// `f32x8::splat(1.0) / self`.
    ///
    /// The accuracy of this approximation is platform dependent.
    fn approx_reciprocal(self) -> Self;
}

impl AvxF32x8 for f32x8 {
    #[inline]
    fn sqrt(self) -> Self {
        unsafe { x86_mm256_sqrt_ps(self) }
    }

    #[inline]
    fn addsub(self, other: Self) -> Self {
        unsafe { x86_mm256_addsub_ps(self, other) }
    }

    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { x86_mm256_hadd_ps(self, other) }
    }

    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { x86_mm256_hsub_ps(self, other) }
    }

    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm256_max_ps(self, other) }
    }

    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm256_min_ps(self, other) }
    }

    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm256_movemask_ps(self) as u32 }
    }

    #[inline]
    fn approx_reciprocal(self) -> Self {
        unsafe { x86_mm256_rcp_ps(self) }
    }

    #[inline]
    fn approx_rsqrt(self) -> Self {
        unsafe { x86_mm256_rsqrt_ps(self) }
    }
}
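// Illustrative sketch (exposition only): because `approx_rsqrt` is only a
// rough estimate (about 12 bits of precision on typical x86 parts), callers
// that need more accuracy usually add one Newton-Raphson step:
//
//     fn rsqrt_refined(x: f32x8) -> f32x8 {
//         let y = x.approx_rsqrt();
//         // y' = y * (1.5 - 0.5 * x * y * y)
//         y * (f32x8::splat(1.5) - f32x8::splat(0.5) * x * y * y)
//     }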
pub trait AvxBool32fx8 {
    fn move_mask(self) -> u32;
}
impl AvxBool32fx8 for bool32fx8 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm256_movemask_ps(bitcast(self)) as u32 }
    }
}

pub trait AvxBool32fx4 {}
impl AvxBool32fx4 for bool32fx4 {}

// 64 bit floats

pub trait AvxF64x2 {
    fn permutevar(self, other: i64x2) -> f64x2;
}
impl AvxF64x2 for f64x2 {
    fn permutevar(self, other: i64x2) -> f64x2 {
        unsafe { x86_mm_permutevar_pd(self, other) }
    }
}

pub trait AvxBool64fx2 {}
impl AvxBool64fx2 for bool64fx2 {}

// 64 bit integers

pub trait AvxU64x2 {}
impl AvxU64x2 for u64x2 {}
pub trait AvxI64x2 {}
impl AvxI64x2 for i64x2 {}

pub trait AvxBool64ix2 {}
impl AvxBool64ix2 for bool64ix2 {}

// 32 bit integers

pub trait AvxU32x4 {}
impl AvxU32x4 for u32x4 {}
pub trait AvxI32x4 {}
impl AvxI32x4 for i32x4 {}

pub trait AvxBool32ix4 {}
impl AvxBool32ix4 for bool32ix4 {}

// 16 bit integers

pub trait AvxU16x8 {}
impl AvxU16x8 for u16x8 {}
pub trait AvxI16x8 {}
impl AvxI16x8 for i16x8 {}

pub trait AvxBool16ix8 {}
impl AvxBool16ix8 for bool16ix8 {}

// 8 bit integers

pub trait AvxU8x16 {}
impl AvxU8x16 for u8x16 {}
pub trait AvxI8x16 {}
impl AvxI8x16 for i8x16 {}

pub trait AvxBool8ix16 {}
impl AvxBool8ix16 for bool8ix16 {}
@@ -0,0 +1,65 @@
use x86::avx::*;

#[allow(dead_code)]
extern "platform-intrinsic" {
    fn x86_mm256_abs_epi8(x: i8x32) -> i8x32;
    fn x86_mm256_abs_epi16(x: i16x16) -> i16x16;
    fn x86_mm256_abs_epi32(x: i32x8) -> i32x8;
    fn x86_mm256_adds_epi8(x: i8x32, y: i8x32) -> i8x32;
    fn x86_mm256_adds_epu8(x: u8x32, y: u8x32) -> u8x32;
    fn x86_mm256_adds_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_adds_epu16(x: u16x16, y: u16x16) -> u16x16;
    fn x86_mm256_avg_epu8(x: u8x32, y: u8x32) -> u8x32;
    fn x86_mm256_avg_epu16(x: u16x16, y: u16x16) -> u16x16;
    fn x86_mm256_hadd_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_hadd_epi32(x: i32x8, y: i32x8) -> i32x8;
    fn x86_mm256_hadds_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_hsub_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_hsub_epi32(x: i32x8, y: i32x8) -> i32x8;
    fn x86_mm256_hsubs_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_madd_epi16(x: i16x16, y: i16x16) -> i32x8;
    fn x86_mm256_maddubs_epi16(x: i8x32, y: i8x32) -> i16x16;
    fn x86_mm256_max_epi8(x: i8x32, y: i8x32) -> i8x32;
    fn x86_mm256_max_epu8(x: u8x32, y: u8x32) -> u8x32;
    fn x86_mm256_max_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_max_epu16(x: u16x16, y: u16x16) -> u16x16;
    fn x86_mm256_max_epi32(x: i32x8, y: i32x8) -> i32x8;
    fn x86_mm256_max_epu32(x: u32x8, y: u32x8) -> u32x8;
    fn x86_mm256_min_epi8(x: i8x32, y: i8x32) -> i8x32;
    fn x86_mm256_min_epu8(x: u8x32, y: u8x32) -> u8x32;
    fn x86_mm256_min_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_min_epu16(x: u16x16, y: u16x16) -> u16x16;
    fn x86_mm256_min_epi32(x: i32x8, y: i32x8) -> i32x8;
    fn x86_mm256_min_epu32(x: u32x8, y: u32x8) -> u32x8;
    fn x86_mm256_mul_epi64(x: i32x8, y: i32x8) -> i64x4;
    fn x86_mm256_mul_epu64(x: u32x8, y: u32x8) -> u64x4;
    fn x86_mm256_mulhi_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_mulhi_epu16(x: u16x16, y: u16x16) -> u16x16;
    fn x86_mm256_mulhrs_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_packs_epi16(x: i16x16, y: i16x16) -> i8x32;
    fn x86_mm256_packus_epi16(x: i16x16, y: i16x16) -> u8x32;
    fn x86_mm256_packs_epi32(x: i32x8, y: i32x8) -> i16x16;
    fn x86_mm256_packus_epi32(x: i32x8, y: i32x8) -> u16x16;
    fn x86_mm256_permutevar8x32_epi32(x: i32x8, y: i32x8) -> i32x8;
    fn x86_mm256_permutevar8x32_ps(x: f32x8, y: i32x8) -> f32x8;
    fn x86_mm256_sad_epu8(x: u8x32, y: u8x32) -> u8x32;
    fn x86_mm256_shuffle_epi8(x: i8x32, y: i8x32) -> i8x32;
    fn x86_mm256_sign_epi8(x: i8x32, y: i8x32) -> i8x32;
    fn x86_mm256_sign_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_sign_epi32(x: i32x8, y: i32x8) -> i32x8;
    fn x86_mm256_subs_epi8(x: i8x32, y: i8x32) -> i8x32;
    fn x86_mm256_subs_epu8(x: u8x32, y: u8x32) -> u8x32;
    fn x86_mm256_subs_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_subs_epu16(x: u16x16, y: u16x16) -> u16x16;
}

// broken on rustc 1.7.0-nightly (1ddaf8bdf 2015-12-12)
// pub trait Avx2F32x8 {
//     fn permutevar(self, other: i32x8) -> f32x8;
// }
//
// impl Avx2F32x8 for f32x8 {
//     fn permutevar(self, other: i32x8) -> f32x8 {
//         unsafe { x86_mm256_permutevar8x32_ps(self, other) }
//     }
// }
@@ -0,0 +1,16 @@
//! Features specific to x86 and x86-64 CPUs.

#[cfg(any(feature = "doc", target_feature = "sse2"))]
pub mod sse2;
#[cfg(any(feature = "doc", target_feature = "sse3"))]
pub mod sse3;
#[cfg(any(feature = "doc", target_feature = "ssse3"))]
pub mod ssse3;
#[cfg(any(feature = "doc", target_feature = "sse4.1"))]
pub mod sse4_1;
#[cfg(any(feature = "doc", target_feature = "sse4.2"))]
pub mod sse4_2;
#[cfg(any(feature = "doc", target_feature = "avx"))]
pub mod avx;
#[cfg(any(feature = "doc", target_feature = "avx2"))]
pub mod avx2;
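// Illustrative sketch (exposition only): each submodule above only exists
// when the matching `target_feature` is enabled at compile time (e.g. with
// `RUSTFLAGS="-C target-feature=+avx"`), so consumers gate their own code
// the same way rather than dispatching at runtime:
//
//     #[cfg(target_feature = "avx")]
//     fn pairwise_sums(v: simd::x86::avx::f32x8) -> simd::x86::avx::f32x8 {
//         use simd::x86::avx::AvxF32x8;
//         v.hadd(v)
//     }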
@@ -0,0 +1,359 @@
use super::super::*;
use {bitcast, simd_cast, f32x2};

pub use sixty_four::{f64x2, i64x2, u64x2, bool64ix2, bool64fx2};

//pub use super::{u64x2, i64x2, f64x2, bool64ix2, bool64fx2};

// strictly speaking, these are SSE instructions, not SSE2.
extern "platform-intrinsic" {
    fn x86_mm_movemask_ps(x: f32x4) -> i32;
    fn x86_mm_max_ps(x: f32x4, y: f32x4) -> f32x4;
    fn x86_mm_min_ps(x: f32x4, y: f32x4) -> f32x4;
    fn x86_mm_rsqrt_ps(x: f32x4) -> f32x4;
    fn x86_mm_rcp_ps(x: f32x4) -> f32x4;
    fn x86_mm_sqrt_ps(x: f32x4) -> f32x4;
}

extern "platform-intrinsic" {
    fn x86_mm_adds_epi8(x: i8x16, y: i8x16) -> i8x16;
    fn x86_mm_adds_epu8(x: u8x16, y: u8x16) -> u8x16;
    fn x86_mm_adds_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_adds_epu16(x: u16x8, y: u16x8) -> u16x8;
    fn x86_mm_avg_epu8(x: u8x16, y: u8x16) -> u8x16;
    fn x86_mm_avg_epu16(x: u16x8, y: u16x8) -> u16x8;
    fn x86_mm_madd_epi16(x: i16x8, y: i16x8) -> i32x4;
    fn x86_mm_max_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_max_epu8(x: u8x16, y: u8x16) -> u8x16;
    fn x86_mm_max_pd(x: f64x2, y: f64x2) -> f64x2;
    fn x86_mm_min_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_min_epu8(x: u8x16, y: u8x16) -> u8x16;
    fn x86_mm_min_pd(x: f64x2, y: f64x2) -> f64x2;
    fn x86_mm_movemask_pd(x: f64x2) -> i32;
    fn x86_mm_movemask_epi8(x: i8x16) -> i32;
    fn x86_mm_mul_epu32(x: u32x4, y: u32x4) -> u64x2;
    fn x86_mm_mulhi_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_mulhi_epu16(x: u16x8, y: u16x8) -> u16x8;
    fn x86_mm_packs_epi16(x: i16x8, y: i16x8) -> i8x16;
    fn x86_mm_packs_epi32(x: i32x4, y: i32x4) -> i16x8;
    fn x86_mm_packus_epi16(x: i16x8, y: i16x8) -> u8x16;
    fn x86_mm_sad_epu8(x: u8x16, y: u8x16) -> u64x2;
    fn x86_mm_sqrt_pd(x: f64x2) -> f64x2;
    fn x86_mm_subs_epi8(x: i8x16, y: i8x16) -> i8x16;
    fn x86_mm_subs_epu8(x: u8x16, y: u8x16) -> u8x16;
    fn x86_mm_subs_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_subs_epu16(x: u16x8, y: u16x8) -> u16x8;
}

#[doc(hidden)]
pub mod common {
    use super::super::super::*;
    use std::mem;

    #[inline]
    pub fn f32x4_sqrt(x: f32x4) -> f32x4 {
        unsafe { super::x86_mm_sqrt_ps(x) }
    }
    #[inline]
    pub fn f32x4_approx_rsqrt(x: f32x4) -> f32x4 {
        unsafe { super::x86_mm_rsqrt_ps(x) }
    }
    #[inline]
    pub fn f32x4_approx_reciprocal(x: f32x4) -> f32x4 {
        unsafe { super::x86_mm_rcp_ps(x) }
    }
    #[inline]
    pub fn f32x4_max(x: f32x4, y: f32x4) -> f32x4 {
        unsafe { super::x86_mm_max_ps(x, y) }
    }
    #[inline]
    pub fn f32x4_min(x: f32x4, y: f32x4) -> f32x4 {
        unsafe { super::x86_mm_min_ps(x, y) }
    }

    macro_rules! bools {
        ($($ty: ty, $all: ident, $any: ident, $movemask: ident, $width: expr;)*) => {
            $(
                #[inline]
                pub fn $all(x: $ty) -> bool {
                    unsafe {
                        super::$movemask(mem::transmute(x)) == (1 << $width) - 1
                    }
                }
                #[inline]
                pub fn $any(x: $ty) -> bool {
                    unsafe {
                        super::$movemask(mem::transmute(x)) != 0
                    }
                }
            )*
        }
    }

    bools! {
        bool32fx4, bool32fx4_all, bool32fx4_any, x86_mm_movemask_ps, 4;
        bool8ix16, bool8ix16_all, bool8ix16_any, x86_mm_movemask_epi8, 16;
        bool16ix8, bool16ix8_all, bool16ix8_any, x86_mm_movemask_epi8, 16;
        bool32ix4, bool32ix4_all, bool32ix4_any, x86_mm_movemask_epi8, 16;
    }
}
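// Illustrative sketch (exposition only): `movemask` packs one sign bit per
// lane (per *byte* for `movemask_epi8`) into the low bits of an integer, so
// "all lanes true" is the full mask and "any lane true" is a nonzero mask:
//
//     4 f32 lanes => all == 0b1111  == (1 << 4) - 1
//     16 bytes    => all == 0xffff  == (1 << 16) - 1
//
// The 16- and 32-bit integer booleans can reuse `movemask_epi8` because an
// all-ones lane sets the sign bit of every one of its bytes.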
// 32 bit floats

pub trait Sse2F32x4 {
    fn to_f64(self) -> f64x2;
    fn move_mask(self) -> u32;
}
impl Sse2F32x4 for f32x4 {
    #[inline]
    fn to_f64(self) -> f64x2 {
        unsafe {
            simd_cast(f32x2(self.0, self.1))
        }
    }
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_ps(self) as u32 }
    }
}
pub trait Sse2Bool32fx4 {
    fn move_mask(self) -> u32;
}
impl Sse2Bool32fx4 for bool32fx4 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_ps(bitcast(self)) as u32 }
    }
}

// 64 bit floats

pub trait Sse2F64x2 {
    fn move_mask(self) -> u32;
    fn sqrt(self) -> Self;
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
}
impl Sse2F64x2 for f64x2 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_pd(bitcast(self)) as u32 }
    }

    #[inline]
    fn sqrt(self) -> Self {
        unsafe { x86_mm_sqrt_pd(self) }
    }

    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_pd(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_pd(self, other) }
    }
}

pub trait Sse2Bool64fx2 {
    fn move_mask(self) -> u32;
}
impl Sse2Bool64fx2 for bool64fx2 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_pd(bitcast(self)) as u32 }
    }
}

// 64 bit ints

pub trait Sse2U64x2 {}
impl Sse2U64x2 for u64x2 {}

pub trait Sse2I64x2 {}
impl Sse2I64x2 for i64x2 {}

pub trait Sse2Bool64ix2 {}
impl Sse2Bool64ix2 for bool64ix2 {}

// 32 bit ints

pub trait Sse2U32x4 {
    fn low_mul(self, other: Self) -> u64x2;
}
impl Sse2U32x4 for u32x4 {
    #[inline]
    fn low_mul(self, other: Self) -> u64x2 {
        unsafe { x86_mm_mul_epu32(self, other) }
    }
}

pub trait Sse2I32x4 {
    fn packs(self, other: Self) -> i16x8;
}
impl Sse2I32x4 for i32x4 {
    #[inline]
    fn packs(self, other: Self) -> i16x8 {
        unsafe { x86_mm_packs_epi32(self, other) }
    }
}
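// Illustrative sketch (exposition only): `packs` narrows with *signed
// saturation*, clamping out-of-range lanes rather than truncating them:
//
//     let a = i32x4::new(70000, -70000, 1, -1);
//     let p = a.packs(a);   // i16 lanes: 32767, -32768, 1, -1 (then repeated)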
pub trait Sse2Bool32ix4 {}
impl Sse2Bool32ix4 for bool32ix4 {}

// 16 bit ints

pub trait Sse2U16x8 {
    fn adds(self, other: Self) -> Self;
    fn subs(self, other: Self) -> Self;
    fn avg(self, other: Self) -> Self;
    fn mulhi(self, other: Self) -> Self;
}
impl Sse2U16x8 for u16x8 {
    #[inline]
    fn adds(self, other: Self) -> Self {
        unsafe { x86_mm_adds_epu16(self, other) }
    }
    #[inline]
    fn subs(self, other: Self) -> Self {
        unsafe { x86_mm_subs_epu16(self, other) }
    }

    #[inline]
    fn avg(self, other: Self) -> Self {
        unsafe { x86_mm_avg_epu16(self, other) }
    }

    #[inline]
    fn mulhi(self, other: Self) -> Self {
        unsafe { x86_mm_mulhi_epu16(self, other) }
    }
}

pub trait Sse2I16x8 {
    fn adds(self, other: Self) -> Self;
    fn subs(self, other: Self) -> Self;
    fn madd(self, other: Self) -> i32x4;
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
    fn mulhi(self, other: Self) -> Self;
    fn packs(self, other: Self) -> i8x16;
    fn packus(self, other: Self) -> u8x16;
}
impl Sse2I16x8 for i16x8 {
    #[inline]
    fn adds(self, other: Self) -> Self {
        unsafe { x86_mm_adds_epi16(self, other) }
    }
    #[inline]
    fn subs(self, other: Self) -> Self {
        unsafe { x86_mm_subs_epi16(self, other) }
    }

    #[inline]
    fn madd(self, other: Self) -> i32x4 {
        unsafe { x86_mm_madd_epi16(self, other) }
    }

    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_epi16(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_epi16(self, other) }
    }

    #[inline]
    fn mulhi(self, other: Self) -> Self {
        unsafe { x86_mm_mulhi_epi16(self, other) }
    }

    #[inline]
    fn packs(self, other: Self) -> i8x16 {
        unsafe { x86_mm_packs_epi16(self, other) }
    }
    #[inline]
    fn packus(self, other: Self) -> u8x16 {
        unsafe { x86_mm_packus_epi16(self, other) }
    }
}

pub trait Sse2Bool16ix8 {}
impl Sse2Bool16ix8 for bool16ix8 {}

// 8 bit ints

pub trait Sse2U8x16 {
    fn move_mask(self) -> u32;
    fn adds(self, other: Self) -> Self;
    fn subs(self, other: Self) -> Self;
    fn avg(self, other: Self) -> Self;
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
    fn sad(self, other: Self) -> u64x2;
}
impl Sse2U8x16 for u8x16 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_epi8(bitcast(self)) as u32 }
    }

    #[inline]
    fn adds(self, other: Self) -> Self {
        unsafe { x86_mm_adds_epu8(self, other) }
    }
    #[inline]
    fn subs(self, other: Self) -> Self {
        unsafe { x86_mm_subs_epu8(self, other) }
    }

    #[inline]
    fn avg(self, other: Self) -> Self {
        unsafe { x86_mm_avg_epu8(self, other) }
    }

    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_epu8(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_epu8(self, other) }
    }

    #[inline]
    fn sad(self, other: Self) -> u64x2 {
        unsafe { x86_mm_sad_epu8(self, other) }
    }
}
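// Illustrative sketch (exposition only): `sad` (sum of absolute differences)
// reduces each 8-byte half to one u64 lane, a common building block for
// motion estimation and for summing bytes quickly:
//
//     let a = u8x16::splat(3);
//     let b = u8x16::splat(1);
//     let d = a.sad(b);     // u64x2: (16, 16) -- eight |3 - 1| per half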
pub trait Sse2I8x16 {
    fn move_mask(self) -> u32;
    fn adds(self, other: Self) -> Self;
    fn subs(self, other: Self) -> Self;
}
impl Sse2I8x16 for i8x16 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_epi8(bitcast(self)) as u32 }
    }

    #[inline]
    fn adds(self, other: Self) -> Self {
        unsafe { x86_mm_adds_epi8(self, other) }
    }
    #[inline]
    fn subs(self, other: Self) -> Self {
        unsafe { x86_mm_subs_epi8(self, other) }
    }
}

pub trait Sse2Bool8ix16 {
    fn move_mask(self) -> u32;
}
impl Sse2Bool8ix16 for bool8ix16 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_epi8(bitcast(self)) as u32 }
    }
}
@@ -0,0 +1,57 @@
use sixty_four::*;
use super::super::*;

extern "platform-intrinsic" {
    fn x86_mm_addsub_ps(x: f32x4, y: f32x4) -> f32x4;
    fn x86_mm_addsub_pd(x: f64x2, y: f64x2) -> f64x2;
    fn x86_mm_hadd_ps(x: f32x4, y: f32x4) -> f32x4;
    fn x86_mm_hadd_pd(x: f64x2, y: f64x2) -> f64x2;
    fn x86_mm_hsub_ps(x: f32x4, y: f32x4) -> f32x4;
    fn x86_mm_hsub_pd(x: f64x2, y: f64x2) -> f64x2;
}

pub trait Sse3F32x4 {
    fn addsub(self, other: Self) -> Self;
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
}

impl Sse3F32x4 for f32x4 {
    #[inline]
    fn addsub(self, other: Self) -> Self {
        unsafe { x86_mm_addsub_ps(self, other) }
    }

    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { x86_mm_hadd_ps(self, other) }
    }

    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { x86_mm_hsub_ps(self, other) }
    }
}
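// Illustrative sketch (exposition only): `hadd` is *horizontal*, pairing
// adjacent lanes within each operand rather than matching lanes across them:
//
//     let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
//     let b = f32x4::new(10.0, 20.0, 30.0, 40.0);
//     let h = a.hadd(b);    // (1+2, 3+4, 10+20, 30+40) = (3, 7, 30, 70)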
pub trait Sse3F64x2 {
    fn addsub(self, other: Self) -> Self;
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
}

impl Sse3F64x2 for f64x2 {
    #[inline]
    fn addsub(self, other: Self) -> Self {
        unsafe { x86_mm_addsub_pd(self, other) }
    }

    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { x86_mm_hadd_pd(self, other) }
    }

    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { x86_mm_hsub_pd(self, other) }
    }
}
@@ -0,0 +1,155 @@
use super::super::*;
use x86::sse2::*;

#[allow(dead_code)]
extern "platform-intrinsic" {
    fn x86_mm_dp_ps(x: f32x4, y: f32x4, z: i32) -> f32x4;
    fn x86_mm_dp_pd(x: f64x2, y: f64x2, z: i32) -> f64x2;
    fn x86_mm_max_epi8(x: i8x16, y: i8x16) -> i8x16;
    fn x86_mm_max_epu16(x: u16x8, y: u16x8) -> u16x8;
    fn x86_mm_max_epi32(x: i32x4, y: i32x4) -> i32x4;
    fn x86_mm_max_epu32(x: u32x4, y: u32x4) -> u32x4;
    fn x86_mm_min_epi8(x: i8x16, y: i8x16) -> i8x16;
    fn x86_mm_min_epu16(x: u16x8, y: u16x8) -> u16x8;
    fn x86_mm_min_epi32(x: i32x4, y: i32x4) -> i32x4;
    fn x86_mm_min_epu32(x: u32x4, y: u32x4) -> u32x4;
    fn x86_mm_minpos_epu16(x: u16x8) -> u16x8;
    fn x86_mm_mpsadbw_epu8(x: u8x16, y: u8x16, z: i32) -> u16x8;
    fn x86_mm_mul_epi32(x: i32x4, y: i32x4) -> i64x2;
    fn x86_mm_packus_epi32(x: i32x4, y: i32x4) -> u16x8;
    fn x86_mm_testc_si128(x: u64x2, y: u64x2) -> i32;
    fn x86_mm_testnzc_si128(x: u64x2, y: u64x2) -> i32;
    fn x86_mm_testz_si128(x: u64x2, y: u64x2) -> i32;
}

// 32 bit floats

pub trait Sse41F32x4 {}
impl Sse41F32x4 for f32x4 {}

// 64 bit floats

pub trait Sse41F64x2 {}
impl Sse41F64x2 for f64x2 {}

// 64 bit integers

pub trait Sse41U64x2 {
    fn testc(self, other: Self) -> i32;
    fn testnzc(self, other: Self) -> i32;
    fn testz(self, other: Self) -> i32;
}
impl Sse41U64x2 for u64x2 {
    #[inline]
    fn testc(self, other: Self) -> i32 {
        unsafe { x86_mm_testc_si128(self, other) }
    }
    #[inline]
    fn testnzc(self, other: Self) -> i32 {
        unsafe { x86_mm_testnzc_si128(self, other) }
    }
    #[inline]
    fn testz(self, other: Self) -> i32 {
        unsafe { x86_mm_testz_si128(self, other) }
    }
}
pub trait Sse41I64x2 {}
impl Sse41I64x2 for i64x2 {}

pub trait Sse41Bool64ix2 {}
impl Sse41Bool64ix2 for bool64ix2 {}

// 32 bit integers

pub trait Sse41U32x4 {
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
}
impl Sse41U32x4 for u32x4 {
    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_epu32(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_epu32(self, other) }
    }
}
pub trait Sse41I32x4 {
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
    fn low_mul(self, other: Self) -> i64x2;
    fn packus(self, other: Self) -> u16x8;
}
impl Sse41I32x4 for i32x4 {
    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_epi32(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_epi32(self, other) }
    }

    #[inline]
    fn low_mul(self, other: Self) -> i64x2 {
        unsafe { x86_mm_mul_epi32(self, other) }
    }
    #[inline]
    fn packus(self, other: Self) -> u16x8 {
        unsafe { x86_mm_packus_epi32(self, other) }
    }
}

pub trait Sse41Bool32ix4 {}
impl Sse41Bool32ix4 for bool32ix4 {}

// 16 bit integers

pub trait Sse41U16x8 {
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
    fn minpos(self) -> Self;
}
impl Sse41U16x8 for u16x8 {
    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_epu16(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_epu16(self, other) }
    }

    #[inline]
    fn minpos(self) -> Self {
        unsafe { x86_mm_minpos_epu16(self) }
    }
}
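// Illustrative sketch (exposition only): `minpos` is the one horizontal
// reduction SSE4.1 offers directly; lane 0 of the result holds the minimum
// value and lane 1 the index of its first occurrence, with the rest zeroed:
//
//     let v = u16x8::new(9, 4, 7, 4, 8, 9, 9, 9);
//     let m = v.minpos();   // lane 0 == 4 (minimum), lane 1 == 1 (index)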
pub trait Sse41I16x8 {}
impl Sse41I16x8 for i16x8 {}

pub trait Sse41Bool16ix8 {}
impl Sse41Bool16ix8 for bool16ix8 {}

// 8 bit integers

pub trait Sse41U8x16 {}
impl Sse41U8x16 for u8x16 {}
pub trait Sse41I8x16 {
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
}
impl Sse41I8x16 for i8x16 {
    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_epi8(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_epi8(self, other) }
    }
}

pub trait Sse41Bool8ix16 {}
impl Sse41Bool8ix16 for bool8ix16 {}
@@ -0,0 +1,19 @@
use i8x16;

#[allow(dead_code)]
extern "platform-intrinsic" {
    fn x86_mm_cmpestra(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
    fn x86_mm_cmpestrc(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
    fn x86_mm_cmpestri(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
    fn x86_mm_cmpestrm(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i8x16;
    fn x86_mm_cmpestro(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
    fn x86_mm_cmpestrs(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
    fn x86_mm_cmpestrz(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
    fn x86_mm_cmpistra(x: i8x16, y: i8x16, z: i32) -> i32;
    fn x86_mm_cmpistrc(x: i8x16, y: i8x16, z: i32) -> i32;
    fn x86_mm_cmpistri(x: i8x16, y: i8x16, z: i32) -> i32;
    fn x86_mm_cmpistrm(x: i8x16, y: i8x16, z: i32) -> i8x16;
    fn x86_mm_cmpistro(x: i8x16, y: i8x16, z: i32) -> i32;
    fn x86_mm_cmpistrs(x: i8x16, y: i8x16, z: i32) -> i32;
    fn x86_mm_cmpistrz(x: i8x16, y: i8x16, z: i32) -> i32;
}
@@ -0,0 +1,172 @@
use super::super::*;
use bitcast;

macro_rules! bitcast {
    ($func: ident($a: ident, $b: ident)) => {
        bitcast($func(bitcast($a), bitcast($b)))
    }
}

extern "platform-intrinsic" {
    fn x86_mm_abs_epi8(x: i8x16) -> i8x16;
    fn x86_mm_abs_epi16(x: i16x8) -> i16x8;
    fn x86_mm_abs_epi32(x: i32x4) -> i32x4;
    fn x86_mm_hadd_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_hadd_epi32(x: i32x4, y: i32x4) -> i32x4;
    fn x86_mm_hadds_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_hsub_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_hsub_epi32(x: i32x4, y: i32x4) -> i32x4;
    fn x86_mm_hsubs_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_maddubs_epi16(x: u8x16, y: i8x16) -> i16x8;
    fn x86_mm_mulhrs_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_shuffle_epi8(x: i8x16, y: i8x16) -> i8x16;
    fn x86_mm_sign_epi8(x: i8x16, y: i8x16) -> i8x16;
    fn x86_mm_sign_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_sign_epi32(x: i32x4, y: i32x4) -> i32x4;
}

// 32 bit integers

pub trait Ssse3I32x4 {
    fn abs(self) -> Self;
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
    fn sign(self, other: Self) -> Self;
}
impl Ssse3I32x4 for i32x4 {
    #[inline]
    fn abs(self) -> Self {
        unsafe { x86_mm_abs_epi32(self) }
    }

    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { x86_mm_hadd_epi32(self, other) }
    }
    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { x86_mm_hsub_epi32(self, other) }
    }

    #[inline]
    fn sign(self, other: Self) -> Self {
        unsafe { x86_mm_sign_epi32(self, other) }
    }
}

pub trait Ssse3U32x4 {
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
}
impl Ssse3U32x4 for u32x4 {
    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { bitcast!(x86_mm_hadd_epi32(self, other)) }
    }
    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { bitcast!(x86_mm_hsub_epi32(self, other)) }
    }
}

// 16 bit integers

pub trait Ssse3I16x8 {
    fn abs(self) -> Self;
    fn hadd(self, other: Self) -> Self;
    fn hadds(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
    fn hsubs(self, other: Self) -> Self;
    fn sign(self, other: Self) -> Self;
    fn mulhrs(self, other: Self) -> Self;
}
impl Ssse3I16x8 for i16x8 {
    #[inline]
    fn abs(self) -> Self {
        unsafe { x86_mm_abs_epi16(self) }
    }

    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { x86_mm_hadd_epi16(self, other) }
    }
    #[inline]
    fn hadds(self, other: Self) -> Self {
        unsafe { x86_mm_hadds_epi16(self, other) }
    }
    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { x86_mm_hsub_epi16(self, other) }
    }
    #[inline]
    fn hsubs(self, other: Self) -> Self {
        unsafe { x86_mm_hsubs_epi16(self, other) }
    }

    #[inline]
    fn sign(self, other: Self) -> Self {
        unsafe { x86_mm_sign_epi16(self, other) }
    }

    #[inline]
    fn mulhrs(self, other: Self) -> Self {
        unsafe { x86_mm_mulhrs_epi16(self, other) }
    }
}

pub trait Ssse3U16x8 {
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
}
impl Ssse3U16x8 for u16x8 {
    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { bitcast!(x86_mm_hadd_epi16(self, other)) }
    }
    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { bitcast!(x86_mm_hsub_epi16(self, other)) }
    }
}

// 8 bit integers

pub trait Ssse3U8x16 {
    fn shuffle_bytes(self, indices: Self) -> Self;
    fn maddubs(self, other: i8x16) -> i16x8;
}

impl Ssse3U8x16 for u8x16 {
    #[inline]
    fn shuffle_bytes(self, indices: Self) -> Self {
        unsafe { bitcast!(x86_mm_shuffle_epi8(self, indices)) }
    }

    fn maddubs(self, other: i8x16) -> i16x8 {
        unsafe { x86_mm_maddubs_epi16(self, other) }
    }
}
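// Illustrative sketch (exposition only): `shuffle_bytes` (PSHUFB) treats
// `self` as a 16-entry table indexed by the low nibble of each byte of
// `indices` (a set high bit zeroes that output byte), so it doubles as a
// fast parallel table lookup:
//
//     let table = u8x16::new(0, 10, 20, 30, 40, 50, 60, 70,
//                            80, 90, 100, 110, 120, 130, 140, 150);
//     let idx = u8x16::splat(3);
//     let r = table.shuffle_bytes(idx);   // every lane == 30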
pub trait Ssse3I8x16 {
    fn abs(self) -> Self;
    fn shuffle_bytes(self, indices: Self) -> Self;
    fn sign(self, other: Self) -> Self;
}
impl Ssse3I8x16 for i8x16 {
    #[inline]
    fn abs(self) -> Self {
        unsafe { x86_mm_abs_epi8(self) }
    }
    #[inline]
    fn shuffle_bytes(self, indices: Self) -> Self {
        unsafe {
            x86_mm_shuffle_epi8(self, indices)
        }
    }

    #[inline]
    fn sign(self, other: Self) -> Self {
        unsafe { x86_mm_sign_epi8(self, other) }
    }
}
@@ -301,6 +301,7 @@ version = "0.6.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
 "cfg-if 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
@@ -852,6 +853,11 @@ dependencies = [
 "nodrop 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "simd"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "siphasher"
version = "0.2.1"
@@ -1295,6 +1301,7 @@ dependencies = [
"checksum serde 0.9.9 (registry+https://github.com/rust-lang/crates.io-index)" = "05a67b8a53f885f4b6e3ed183806035819f9862474e747fe4488a6d63bcbfcb7"
"checksum serde_codegen_internals 0.14.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4d52006899f910528a10631e5b727973fe668f3228109d1707ccf5bad5490b6e"
"checksum serde_derive 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)" = "f15ea24bd037b2d64646b4d934fa99c649be66e3f7b29fb595a5543b212b1452"
"checksum simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a94d14a2ae1f1f110937de5fb69e494372560181c7e1739a097fcc2cee37ba0"
"checksum siphasher 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2ffc669b726f2bc9a3bcff66e5e23b56ba6bf70e22a34c3d7b6d0b3450b65b84"
"checksum smallvec 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "4f8266519bc1d17d0b5b16f6c21295625d562841c708f6376f49028a43e9c11e"
"checksum smallvec 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2e40af10aafe98b4d8294ae8388d8a5cd0707c65d364872efe72d063ec44bee0"
@@ -11,7 +11,7 @@ servo = ["gkrust-shared/servo"]
quantum_render = ["gkrust-shared/quantum_render"]
cubeb_pulse_rust = ["gkrust-shared/cubeb_pulse_rust"]
gecko_debug = ["gkrust-shared/gecko_debug"]
# simd-accel = ["gkrust-shared/simd-accel"]
simd-accel = ["gkrust-shared/simd-accel"]
no-static-ideograph-encoder-tables = ["gkrust-shared/no-static-ideograph-encoder-tables"]
# parallel-utf8 = ["gkrust-shared/parallel-utf8"]
@@ -299,6 +299,7 @@ version = "0.6.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
 "cfg-if 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]

@@ -839,6 +840,11 @@ dependencies = [
 "nodrop 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "simd"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "siphasher"
version = "0.2.1"

@@ -1282,6 +1288,7 @@ dependencies = [
"checksum serde 0.9.9 (registry+https://github.com/rust-lang/crates.io-index)" = "05a67b8a53f885f4b6e3ed183806035819f9862474e747fe4488a6d63bcbfcb7"
"checksum serde_codegen_internals 0.14.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4d52006899f910528a10631e5b727973fe668f3228109d1707ccf5bad5490b6e"
"checksum serde_derive 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)" = "f15ea24bd037b2d64646b4d934fa99c649be66e3f7b29fb595a5543b212b1452"
"checksum simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a94d14a2ae1f1f110937de5fb69e494372560181c7e1739a097fcc2cee37ba0"
"checksum siphasher 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2ffc669b726f2bc9a3bcff66e5e23b56ba6bf70e22a34c3d7b6d0b3450b65b84"
"checksum smallvec 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "4f8266519bc1d17d0b5b16f6c21295625d562841c708f6376f49028a43e9c11e"
"checksum smallvec 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2e40af10aafe98b4d8294ae8388d8a5cd0707c65d364872efe72d063ec44bee0"
@@ -11,7 +11,7 @@ servo = ["gkrust-shared/servo"]
quantum_render = ["gkrust-shared/quantum_render"]
cubeb_pulse_rust = ["gkrust-shared/cubeb_pulse_rust"]
gecko_debug = ["gkrust-shared/gecko_debug"]
# simd-accel = ["gkrust-shared/simd-accel"]
simd-accel = ["gkrust-shared/simd-accel"]
no-static-ideograph-encoder-tables = ["gkrust-shared/no-static-ideograph-encoder-tables"]
# parallel-utf8 = ["gkrust-shared/parallel-utf8"]
@@ -20,4 +20,7 @@ if CONFIG['MOZ_BUILD_WEBRENDER']:
if CONFIG['MOZ_PULSEAUDIO']:
    gkrust_features += ['cubeb_pulse_rust']

if CONFIG['MOZ_RUST_SIMD']:
    gkrust_features += ['simd-accel']

gkrust_features += ['no-static-ideograph-encoder-tables']
@@ -795,6 +795,19 @@ set_config('MOZ_BUILD_WEBRENDER', webrender.build)
set_define('MOZ_BUILD_WEBRENDER', webrender.build)
set_config('MOZ_ENABLE_WEBRENDER', webrender.enable)

# SIMD acceleration for Rust code (currently just encoding_rs)

option('--enable-rust-simd', env='MOZ_RUST_SIMD',
       help='Enable explicit SIMD in Rust code.')

@depends('--enable-rust-simd')
def rust_simd(value):
    if value:
        return True

set_config('MOZ_RUST_SIMD', rust_simd)
set_define('MOZ_RUST_SIMD', rust_simd)

# Printing
# ==============================================================
@depends(target)
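# Illustrative note (not part of the patch): with the option above defined, a
# developer opts in via the standard mozconfig mechanism, e.g.
#
#   ac_add_options --enable-rust-simd
#
# which sets MOZ_RUST_SIMD and, through the moz.build change above, adds the
# simd-accel feature to gkrust.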