Bug 1261841 part 4 - Add a configuration option for enabling explicit SIMD in Rust. r=froydnj.

MozReview-Commit-ID: ICifcJ9499a
Henri Sivonen 2017-05-29 17:11:18 +03:00
Parent d35d69cef8
Commit 763d66dd51
45 changed files with 6149 additions and 3 deletions


@@ -963,6 +963,11 @@ else
environment_cleaner =
endif
rust_unlock_unstable =
ifdef MOZ_RUST_SIMD
rust_unlock_unstable += RUSTC_BOOTSTRAP=1
endif
# This function is intended to be called by:
#
# $(call CARGO_BUILD,EXTRA_ENV_VAR1=X EXTRA_ENV_VAR2=Y ...)
@@ -971,7 +976,7 @@ endif
#
# $(call CARGO_BUILD)
define CARGO_BUILD
-env $(environment_cleaner) $(rustflags_override) \
+env $(environment_cleaner) $(rust_unlock_unstable) $(rustflags_override) \
CARGO_TARGET_DIR=$(CARGO_TARGET_DIR) \
RUSTC=$(RUSTC) \
MOZ_SRC=$(topsrcdir) \
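RUSTC_BOOTSTRAP=1 makes a stable rustc accept nightly-only #![feature(...)] gates, which the vendored simd crate below needs. A minimal sketch of the kind of code this unlocks, using only APIs that appear elsewhere in this commit (illustrative, not part of the patch):

#![feature(cfg_target_feature)] // nightly gate; accepted because RUSTC_BOOTSTRAP=1 is exported

extern crate simd;
use simd::f32x4;

fn main() {
    // lane-wise SIMD arithmetic on a 4-lane f32 vector
    let v = f32x4::splat(1.0) + f32x4::new(1.0, 2.0, 3.0, 4.0);
    println!("{}", v.extract(0)); // prints 2
}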

1
third_party/rust/simd/.cargo-checksum.json vendored Normal file

@@ -0,0 +1 @@
{"files":{".cargo-ok":"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",".gitignore":"695a11fe96751963cac851b3f556a8b922ae48ef73ee929cfe66164ea3db40cd",".travis.yml":"e2c720c3633b7671efce49147c62b12bcbf630d7c5d6fc65cd97620bfa4ddcea","Cargo.toml":"608aad04f17a524ee21048fa2ce9f656ae344e0473dd0e331dc954f0f9677c63","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6d3a9431e65e69c73a8923e6517b889d17549b23db406b9ec027710d16af701f","README.md":"249294a9a5f63c64c0f7fe4a607060f43f3507dce2378067aa59d25fb3ae681d","benches/mandelbrot.rs":"051b5199e66bca6cf7774e9024915fd4e1349ab39726a10a14e06b60d65d87a4","benches/matrix.rs":"048a21dacdb62365e0105d00d2c8cd6bd2396ac81134f2bff7eb4f7d095fb735","examples/axpy.rs":"4307626045d64ec08361c97c9c72c5dc8d361bdc88f64453b97ac0212041a1b2","examples/convert.rs":"8e658fde050f8a0d8b84ad7570446b10fcf544afbd551b940ca340474f324840","examples/dot-product.rs":"6fe2e007c147af5353804173a593c5b9d57dbccec156e1da37e9e32537363f91","examples/fannkuch-redux-nosimd.rs":"7b2fbde35e8666929d14d67328471cb0483d038a5325232f8db148b30865312b","examples/fannkuch-redux.rs":"ea21fdbd2274488a62cc984acad6e0b65d52f24fb4ff63b7057a3a667e9c8aae","examples/mandelbrot.rs":"8b8fdca1edac50e5a33e0e0592bd41eb75114f31839ccd40d485c61a9a664380","examples/matrix-inverse.rs":"a378d20ef20c2119bb10a86de27c92fec2c2f77f374e6bfd36707c9825a5fe92","examples/nbody-nosimd.rs":"2c8e0a7feacd202fdd65eeceb6420d6e9f43340b81f20a8e532704a587a2796b","examples/nbody.rs":"a864311affab262024479d6348ff51af43d809e9ad332ec30ea4aacceaa2eae1","examples/ops.rs":"1316f915d0afcfa98fdc4077e965ccccf6b4b21c433cbe487ff0cdc60df3cd39","examples/spectral-norm-nosimd.rs":"ffc8512ecde779078ea467f38f423a0ea623c63da7078193f9dd370200773f79","examples/spectral-norm.rs":"edb09c9d477f83939098cfb77a27cc298bc7a0c8a8e29cece0cccae0d70d890e","src/aarch64/mod.rs":"83f52775364c98de0cecb7e1509530c18972e932469f5f1522aa24a735d0fa37","src/aarch64/neon.rs":"1fe769979e07d8e2bc3c78ce116e05d735860744efe097a894cc9421153257fb","src/arm/mod.rs":"dcdd90bc0b39abaf86a0c8946d442b16313563fbae1ff03248628275c74d8617","src/arm/neon.rs":"51cc509856200e80f8e4cc2c982586e6d1cef593ec4537e153dce0cfe31d3428","src/common.rs":"62f4e7e0fefb52ad190d0f2191bc435ac4deab3f2bc70dc427f2a7f9ccb7856e","src/lib.rs":"25f0b39c038fa85af858318135dfd87865be26c33bb4bd1438aec96a1e68d8b5","src/sixty_four.rs":"510a9e00189a61e4f0a5beb7052d5dee37fc8261f94a2af45ef10327e0f3b7df","src/v256.rs":"2e328e49034876d535e0627c7a62191da2b4fb156a657614bf531a5fc75b1385","src/x86/avx.rs":"c66140abefca634b48eae307c3ec8cf5a40f2279b10e246a7e2ac602a2a2bb28","src/x86/avx2.rs":"efe3006b13a13261a3dec3d37dc1d8cb53950f3803c420069231803374949937","src/x86/mod.rs":"0acc5a5e2672e2a0fddc11065663be8b8fa2da87320ea291fa86ff8c2f33edf5","src/x86/sse2.rs":"5ceda75a401958a135fc9d851b22075314cdeed69fd483b6a7be4f11373f40da","src/x86/sse3.rs":"9bd01a4f08069ca4f445952e744d651efe887e3835b18872e757375f0d053bd2","src/x86/sse4_1.rs":"9ceb80dd70a7e7dfeef508cb935e1a2637175bc87a3b090f5dea691ff6aa0516","src/x86/sse4_2.rs":"c59321aed8decdce4d0d8570cff46aed02e1a8265647ef7702e9b180fc581254","src/x86/ssse3.rs":"2290f0269bae316b8e0491495645ee38a9bd73525c8572759c1328341c3bdb4c"},"package":"7a94d14a2ae1f1f110937de5fb69e494372560181c7e1739a097fcc2cee37ba0"}

0
third_party/rust/simd/.cargo-ok vendored Normal file

3
third_party/rust/simd/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
perf.data*
/target
Cargo.lock

24
third_party/rust/simd/.travis.yml vendored Normal file

@@ -0,0 +1,24 @@
language: rust
sudo: false
rust:
  - nightly

# load travis-cargo
before_script:
  - |
    pip install 'travis-cargo<0.2' --user &&
    export PATH=$HOME/.local/bin:$PATH

# the main build
script:
  - |
    travis-cargo build &&
    travis-cargo test &&
    travis-cargo bench &&
    travis-cargo doc -- --features doc

env:
  global:
    # override the default `--features unstable` used for the nightly branch (optional)
    - TRAVIS_CARGO_NIGHTLY_FEATURE="with-serde"

26
third_party/rust/simd/Cargo.toml vendored Normal file

@@ -0,0 +1,26 @@
[package]
name = "simd"
version = "0.2.0"
authors = ["Huon Wilson <dbau.pp+github@gmail.com>"]
repository = "https://github.com/rust-lang-nursery/simd"
documentation = "https://rust-lang-nursery.github.io/simd/doc/simd"
license = "MIT/Apache-2.0"
keywords = ["simd", "data-parallel"]
readme = "README.md"
description = """
`simd` offers limited cross-platform access to SIMD instructions on
CPUs, as well as raw interfaces to platform-specific instructions.
"""
[dependencies]
serde = { version = "0.8", optional = true }
serde_derive = { version = "0.8", optional = true }
[dev-dependencies]
cfg-if = "0.1"
[features]
doc = []
with-serde = ["serde", "serde_derive"]

201
third_party/rust/simd/LICENSE-APACHE vendored Normal file

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

25
third_party/rust/simd/LICENSE-MIT vendored Normal file

@@ -0,0 +1,25 @@
Copyright (c) 2014 Huon Wilson
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

7
third_party/rust/simd/README.md vendored Normal file

@@ -0,0 +1,7 @@
# `simd`
[![Build Status](https://travis-ci.org/rust-lang-nursery/simd.png)](https://travis-ci.org/rust-lang-nursery/simd)
`simd` offers a basic interface to the SIMD functionality of CPUs.
[Documentation](https://docs.rs/simd)
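The crate pairs portable types (f32x4, u8x16, ...) with raw platform modules (src/x86, src/arm, src/aarch64 in this tree). A small sketch of typical use of the portable layer; the helper name double_in_place is illustrative only:

extern crate simd;
use simd::f32x4;

// Double each element four lanes at a time; the scalar tail
// (len % 4 trailing elements) is deliberately left untouched here.
fn double_in_place(xs: &mut [f32]) {
    let mut i = 0;
    while i + 4 <= xs.len() {
        let v = f32x4::load(xs, i);
        (v + v).store(xs, i);
        i += 4;
    }
}

fn main() {
    let mut xs = [1.0, 3.0, 5.0, 7.0];
    double_in_place(&mut xs);
    println!("{} {} {} {}", xs[0], xs[1], xs[2], xs[3]); // 2 6 10 14
}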

117
third_party/rust/simd/benches/mandelbrot.rs vendored Executable file

@@ -0,0 +1,117 @@
#![feature(test)]
#![feature(cfg_target_feature)]
extern crate simd;
extern crate test;
use test::black_box as bb;
use test::Bencher as B;
use simd::{f32x4, u32x4};
#[cfg(any(target_feature = "avx", target_feature = "avx2"))]
use simd::x86::avx::{f32x8, u32x8};
fn naive(c_x: f32, c_y: f32, max_iter: u32) -> u32 {
let mut x = c_x;
let mut y = c_y;
let mut count = 0;
while count < max_iter {
let xy = x * y;
let xx = x * x;
let yy = y * y;
let sum = xx + yy;
if sum > 4.0 {
break
}
count += 1;
x = xx - yy + c_x;
y = xy * 2.0 + c_y;
}
count
}
fn simd4(c_x: f32x4, c_y: f32x4, max_iter: u32) -> u32x4 {
let mut x = c_x;
let mut y = c_y;
let mut count = u32x4::splat(0);
for _ in 0..max_iter as usize {
let xy = x * y;
let xx = x * x;
let yy = y * y;
let sum = xx + yy;
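// per-lane escape test: lanes with |z|^2 < 4 keep iterating; select() below adds 1 only to those lanes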
let mask = sum.lt(f32x4::splat(4.0));
if !mask.any() { break }
count = count + mask.to_i().select(u32x4::splat(1), u32x4::splat(0));
x = xx - yy + c_x;
y = xy + xy + c_y;
}
count
}
#[cfg(target_feature = "avx")]
fn simd8(c_x: f32x8, c_y: f32x8, max_iter: u32) -> u32x8 {
let mut x = c_x;
let mut y = c_y;
let mut count = u32x8::splat(0);
for _ in 0..max_iter as usize {
let xy = x * y;
let xx = x * x;
let yy = y * y;
let sum = xx + yy;
let mask = sum.lt(f32x8::splat(4.0));
if !mask.any() { break }
count = count + mask.to_i().select(u32x8::splat(1), u32x8::splat(0));
x = xx - yy + c_x;
y = xy + xy + c_y;
}
count
}
const SCALE: f32 = 3.0 / 100.0;
const N: u32 = 100;
#[bench]
fn mandel_naive(b: &mut B) {
b.iter(|| {
for j in 0..100 {
let y = -1.5 + (j as f32) * SCALE;
for i in 0..100 {
let x = -2.2 + (i as f32) * SCALE;
bb(naive(x, y, N));
}
}
})
}
#[bench]
fn mandel_simd4(b: &mut B) {
let tweak = u32x4::new(0, 1, 2, 3);
b.iter(|| {
for j in 0..100 {
let y = f32x4::splat(-1.5) + f32x4::splat(SCALE) * u32x4::splat(j).to_f32();
for i in 0..25 {
let i = u32x4::splat(i * 4) + tweak;
let x = f32x4::splat(-2.2) + f32x4::splat(SCALE) * i.to_f32();
bb(simd4(x, y, N));
}
}
})
}
#[cfg(any(target_feature = "avx", target_feature = "avx2"))]
#[bench]
fn mandel_simd8(b: &mut B) {
let tweak = u32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
b.iter(|| {
for j in 0..100 {
let y = f32x8::splat(-1.5) + f32x8::splat(SCALE) * u32x8::splat(j).to_f32();
for i in 0..13 { // 100 not divisible by 8 :(
let i = u32x8::splat(i * 8) + tweak;
let x = f32x8::splat(-2.2) + f32x8::splat(SCALE) * i.to_f32();
bb(simd8(x, y, N));
}
}
})
}

485
third_party/rust/simd/benches/matrix.rs vendored Executable file

@@ -0,0 +1,485 @@
#![feature(test)]
#![feature(cfg_target_feature)]
extern crate test;
extern crate simd;
use test::black_box as bb;
use test::Bencher as B;
use simd::f32x4;
#[cfg(target_feature = "avx")]
use simd::x86::avx::{f32x8, f64x4};
// #[cfg(target_feature = "avx2")]
// use simd::x86::avx2::Avx2F32x8;
#[bench]
fn multiply_naive(b: &mut B) {
let x = [[1.0_f32; 4]; 4];
let y = [[2.0; 4]; 4];
b.iter(|| {
for _ in 0..100 {
let (x, y) = bb((&x, &y));
bb(&[[x[0][0] * y[0][0] + x[1][0] * y[0][1] + x[2][0] * y[0][2] + x[3][0] * y[0][3],
x[0][1] * y[0][0] + x[1][1] * y[0][1] + x[2][1] * y[0][2] + x[3][1] * y[0][3],
x[0][2] * y[0][0] + x[1][2] * y[0][1] + x[2][2] * y[0][2] + x[3][2] * y[0][3],
x[0][3] * y[0][0] + x[1][3] * y[0][1] + x[2][3] * y[0][2] + x[3][3] * y[0][3]],
[x[0][0] * y[1][0] + x[1][0] * y[1][1] + x[2][0] * y[1][2] + x[3][0] * y[1][3],
x[0][1] * y[1][0] + x[1][1] * y[1][1] + x[2][1] * y[1][2] + x[3][1] * y[1][3],
x[0][2] * y[1][0] + x[1][2] * y[1][1] + x[2][2] * y[1][2] + x[3][2] * y[1][3],
x[0][3] * y[1][0] + x[1][3] * y[1][1] + x[2][3] * y[1][2] + x[3][3] * y[1][3]],
[x[0][0] * y[2][0] + x[1][0] * y[2][1] + x[2][0] * y[2][2] + x[3][0] * y[2][3],
x[0][1] * y[2][0] + x[1][1] * y[2][1] + x[2][1] * y[2][2] + x[3][1] * y[2][3],
x[0][2] * y[2][0] + x[1][2] * y[2][1] + x[2][2] * y[2][2] + x[3][2] * y[2][3],
x[0][3] * y[2][0] + x[1][3] * y[2][1] + x[2][3] * y[2][2] + x[3][3] * y[2][3]],
[x[0][0] * y[3][0] + x[1][0] * y[3][1] + x[2][0] * y[3][2] + x[3][0] * y[3][3],
x[0][1] * y[3][0] + x[1][1] * y[3][1] + x[2][1] * y[3][2] + x[3][1] * y[3][3],
x[0][2] * y[3][0] + x[1][2] * y[3][1] + x[2][2] * y[3][2] + x[3][2] * y[3][3],
x[0][3] * y[3][0] + x[1][3] * y[3][1] + x[2][3] * y[3][2] + x[3][3] * y[3][3]],
]);
}
})
}
#[bench]
fn multiply_simd4_32(b: &mut B) {
let x = [f32x4::splat(1.0_f32); 4];
let y = [f32x4::splat(2.0); 4];
b.iter(|| {
for _ in 0..100 {
let (x, y) = bb((&x, &y));
let y0 = y[0];
let y1 = y[1];
let y2 = y[2];
let y3 = y[3];
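// column-major 4x4 product: broadcast each scalar of a column of y with splat(extract(k)) and accumulate scaled columns of x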
bb(&[f32x4::splat(y0.extract(0)) * x[0] +
f32x4::splat(y0.extract(1)) * x[1] +
f32x4::splat(y0.extract(2)) * x[2] +
f32x4::splat(y0.extract(3)) * x[3],
f32x4::splat(y1.extract(0)) * x[0] +
f32x4::splat(y1.extract(1)) * x[1] +
f32x4::splat(y1.extract(2)) * x[2] +
f32x4::splat(y1.extract(3)) * x[3],
f32x4::splat(y2.extract(0)) * x[0] +
f32x4::splat(y2.extract(1)) * x[1] +
f32x4::splat(y2.extract(2)) * x[2] +
f32x4::splat(y2.extract(3)) * x[3],
f32x4::splat(y3.extract(0)) * x[0] +
f32x4::splat(y3.extract(1)) * x[1] +
f32x4::splat(y3.extract(2)) * x[2] +
f32x4::splat(y3.extract(3)) * x[3],
]);
}
})
}
#[cfg(target_feature = "avx")]
#[bench]
fn multiply_simd4_64(b: &mut B) {
let x = [f64x4::splat(1.0_f64); 4];
let y = [f64x4::splat(2.0); 4];
b.iter(|| {
for _ in 0..100 {
let (x, y) = bb((&x, &y));
let y0 = y[0];
let y1 = y[1];
let y2 = y[2];
let y3 = y[3];
bb(&[f64x4::splat(y0.extract(0)) * x[0] +
f64x4::splat(y0.extract(1)) * x[1] +
f64x4::splat(y0.extract(2)) * x[2] +
f64x4::splat(y0.extract(3)) * x[3],
f64x4::splat(y1.extract(0)) * x[0] +
f64x4::splat(y1.extract(1)) * x[1] +
f64x4::splat(y1.extract(2)) * x[2] +
f64x4::splat(y1.extract(3)) * x[3],
f64x4::splat(y2.extract(0)) * x[0] +
f64x4::splat(y2.extract(1)) * x[1] +
f64x4::splat(y2.extract(2)) * x[2] +
f64x4::splat(y2.extract(3)) * x[3],
f64x4::splat(y3.extract(0)) * x[0] +
f64x4::splat(y3.extract(1)) * x[1] +
f64x4::splat(y3.extract(2)) * x[2] +
f64x4::splat(y3.extract(3)) * x[3],
]);
}
})
}
#[bench]
fn inverse_naive(b: &mut B) {
let mut x = [[0_f32; 4]; 4];
for i in 0..4 { x[i][i] = 1.0 }
b.iter(|| {
for _ in 0..100 {
let x = bb(&x);
let mut t = [[0_f32; 4]; 4];
for i in 0..4 {
t[0][i] = x[i][0];
t[1][i] = x[i][1];
t[2][i] = x[i][2];
t[3][i] = x[i][3];
}
let _0 = t[2][2] * t[3][3];
let _1 = t[2][3] * t[3][2];
let _2 = t[2][1] * t[3][3];
let _3 = t[2][3] * t[3][1];
let _4 = t[2][1] * t[3][2];
let _5 = t[2][2] * t[3][1];
let _6 = t[2][0] * t[3][3];
let _7 = t[2][3] * t[3][0];
let _8 = t[2][0] * t[3][2];
let _9 = t[2][2] * t[3][0];
let _10 = t[2][0] * t[3][1];
let _11 = t[2][1] * t[3][0];
let d00 = _0 * t[1][1] + _3 * t[1][2] + _4 * t[1][3] -
(_1 * t[1][1] + _2 * t[1][2] + _5 * t[1][3]);
let d01 = _1 * t[1][0] + _6 * t[1][2] + _9 * t[1][3] -
(_0 * t[1][0] + _7 * t[1][2] + _8 * t[1][3]);
let d02 = _2 * t[1][0] + _7 * t[1][1] + _10 * t[1][3] -
(_3 * t[1][0] + _6 * t[1][1] + _11 * t[1][3]);
let d03 = _5 * t[1][0] + _8 * t[1][1] + _11 * t[1][2] -
(_4 * t[1][0] + _9 * t[1][1] + _10 * t[1][2]);
let d10 = _1 * t[0][1] + _2 * t[0][2] + _5 * t[0][3] -
(_0 * t[0][1] + _3 * t[0][2] + _4 * t[0][3]);
let d11 = _0 * t[0][0] + _7 * t[0][2] + _8 * t[0][3] -
(_1 * t[0][0] + _6 * t[0][2] + _9 * t[0][3]);
let d12 = _3 * t[0][0] + _6 * t[0][1] + _11 * t[0][3] -
(_2 * t[0][0] + _7 * t[0][1] + _10 * t[0][3]);
let d13 = _4 * t[0][0] + _9 * t[0][1] + _10 * t[0][2] -
(_5 * t[0][0] + _8 * t[0][1] + _11 * t[0][2]);
let _0 = t[0][2] * t[1][3];
let _1 = t[0][3] * t[1][2];
let _2 = t[0][1] * t[1][3];
let _3 = t[0][3] * t[1][1];
let _4 = t[0][1] * t[1][2];
let _5 = t[0][2] * t[1][1];
let _6 = t[0][0] * t[1][3];
let _7 = t[0][3] * t[1][0];
let _8 = t[0][0] * t[1][2];
let _9 = t[0][2] * t[1][0];
let _10 = t[0][0] * t[1][1];
let _11 = t[0][1] * t[1][0];
let d20 = _0*t[3][1] + _3*t[3][2] + _4*t[3][3]-
(_1*t[3][1] + _2*t[3][2] + _5*t[3][3]);
let d21 = _1*t[3][0] + _6*t[3][2] + _9*t[3][3]-
(_0*t[3][0] + _7*t[3][2] + _8*t[3][3]);
let d22 = _2*t[3][0] + _7*t[3][1] + _10*t[3][3]-
(_3*t[3][0] + _6*t[3][1] + _11*t[3][3]);
let d23 = _5*t[3][0] + _8*t[3][1] + _11*t[3][2]-
(_4*t[3][0] + _9*t[3][1] + _10*t[3][2]);
let d30 = _2*t[2][2] + _5*t[2][3] + _1*t[2][1]-
(_4*t[2][3] + _0*t[2][1] + _3*t[2][2]);
let d31 = _8*t[2][3] + _0*t[2][0] + _7*t[2][2]-
(_6*t[2][2] + _9*t[2][3] + _1*t[2][0]);
let d32 = _6*t[2][1] + _11*t[2][3] + _3*t[2][0]-
(_10*t[2][3] + _2*t[2][0] + _7*t[2][1]);
let d33 = _10*t[2][2] + _4*t[2][0] + _9*t[2][1]-
(_8*t[2][1] + _11*t[2][2] + _5*t[2][0]);
let det = t[0][0] * d00 + t[0][1] * d01 + t[0][2] * d02 + t[0][3] * d03;
let det = 1.0 / det;
let mut ret = [[d00, d01, d02, d03],
[d10, d11, d12, d13],
[d20, d21, d22, d23],
[d30, d31, d32, d33]];
for i in 0..4 {
for j in 0..4 {
ret[i][j] *= det;
}
}
bb(&ret);
}
})
}
#[bench]
fn inverse_simd4(b: &mut B) {
let mut x = [f32x4::splat(0_f32); 4];
for i in 0..4 { x[i] = x[i].replace(i as u32, 1.0); }
fn shuf0145(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(0), v.extract(1),
w.extract(4 - 4), w.extract(5 - 4))
}
fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(0), v.extract(2),
w.extract(4 - 4), w.extract(6 - 4))
}
fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(1), v.extract(3),
w.extract(5 - 4), w.extract(7 - 4))
}
fn shuf2367(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(2), v.extract(3),
w.extract(6 - 4), w.extract(7 - 4))
}
fn swiz1032(v: f32x4) -> f32x4 {
f32x4::new(v.extract(1), v.extract(0),
v.extract(3), v.extract(2))
}
fn swiz2301(v: f32x4) -> f32x4 {
f32x4::new(v.extract(2), v.extract(3),
v.extract(0), v.extract(1))
}
b.iter(|| {
for _ in 0..100 {
let src0;
let src1;
let src2;
let src3;
let mut tmp1;
let row0;
let mut row1;
let mut row2;
let mut row3;
let mut minor0;
let mut minor1;
let mut minor2;
let mut minor3;
let mut det;
let x = bb(&x);
src0 = x[0];
src1 = x[1];
src2 = x[2];
src3 = x[3];
tmp1 = shuf0145(src0, src1);
row1 = shuf0145(src2, src3);
row0 = shuf0246(tmp1, row1);
row1 = shuf1357(row1, tmp1);
tmp1 = shuf2367(src0, src1);
row3 = shuf2367(src2, src3);
row2 = shuf0246(tmp1, row3);
row3 = shuf0246(row3, tmp1);
tmp1 = row2 * row3;
tmp1 = swiz1032(tmp1);
minor0 = row1 * tmp1;
minor1 = row0 * tmp1;
tmp1 = swiz2301(tmp1);
minor0 = (row1 * tmp1) - minor0;
minor1 = (row0 * tmp1) - minor1;
minor1 = swiz2301(minor1);
tmp1 = row1 * row2;
tmp1 = swiz1032(tmp1);
minor0 = (row3 * tmp1) + minor0;
minor3 = row0 * tmp1;
tmp1 = swiz2301(tmp1);
minor0 = minor0 - row3 * tmp1;
minor3 = row0 * tmp1 - minor3;
minor3 = swiz2301(minor3);
tmp1 = row3 * swiz2301(row1);
tmp1 = swiz1032(tmp1);
row2 = swiz2301(row2);
minor0 = row2 * tmp1 + minor0;
minor2 = row0 * tmp1;
tmp1 = swiz2301(tmp1);
minor0 = minor0 - row2 * tmp1;
minor2 = row0 * tmp1 - minor2;
minor2 = swiz2301(minor2);
tmp1 = row0 * row1;
tmp1 = swiz1032(tmp1);
minor2 = minor2 + row3 * tmp1;
minor3 = row2 * tmp1 - minor3;
tmp1 = swiz2301(tmp1);
minor2 = row3 * tmp1 - minor2;
minor3 = minor3 - row2 * tmp1;
tmp1 = row0 * row3;
tmp1 = swiz1032(tmp1);
minor1 = minor1 - row2 * tmp1;
minor2 = row1 * tmp1 + minor2;
tmp1 = swiz2301(tmp1);
minor1 = row2 * tmp1 + minor1;
minor2 = minor2 - row1 * tmp1;
tmp1 = row0 * row2;
tmp1 = swiz1032(tmp1);
minor1 = row3 * tmp1 + minor1;
minor3 = minor3 - row1 * tmp1;
tmp1 = swiz2301(tmp1);
minor1 = minor1 - row3 * tmp1;
minor3 = row1 * tmp1 + minor3;
det = row0 * minor0;
det = swiz2301(det) + det;
det = swiz1032(det) + det;
//tmp1 = det.approx_reciprocal(); det = tmp1 * (f32x4::splat(2.0) - det * tmp1);
det = f32x4::splat(1.0) / det;
bb(&[minor0 * det, minor1 * det, minor2 * det, minor3 * det]);
}
})
}
#[bench]
fn transpose_naive(b: &mut B) {
let x = [[0_f32; 4]; 4];
b.iter(|| {
for _ in 0..100 {
let x = bb(&x);
bb(&[[x[0][0], x[1][0], x[2][0], x[3][0]],
[x[0][1], x[1][1], x[2][1], x[3][1]],
[x[0][2], x[1][2], x[2][2], x[3][2]],
[x[0][3], x[1][3], x[2][3], x[3][3]]]);
}
})
}
#[bench]
fn transpose_simd4(b: &mut B) {
let x = [f32x4::splat(0_f32); 4];
fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(0), v.extract(2),
w.extract(4 - 4), w.extract(6 - 4))
}
fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(1), v.extract(3),
w.extract(5 - 4), w.extract(7 - 4))
}
b.iter(|| {
for _ in 0..100 {
let x = bb(&x);
let x0 = x[0];
let x1 = x[1];
let x2 = x[2];
let x3 = x[3];
let a0 = shuf0246(x0, x1);
let a1 = shuf0246(x2, x3);
let a2 = shuf1357(x0, x1);
let a3 = shuf1357(x2, x3);
let b0 = shuf0246(a0, a1);
let b1 = shuf0246(a2, a3);
let b2 = shuf1357(a0, a1);
let b3 = shuf1357(a2, a3);
bb(&[b0, b1, b2, b3]);
}
})
}
#[cfg(target_feature = "avx")]
#[bench]
fn transpose_simd8_naive(b: &mut B) {
let x = [f32x8::splat(0_f32); 2];
fn shuf0246(v: f32x8, w: f32x8) -> f32x8 {
f32x8::new(v.extract(0), v.extract(2), v.extract(4), v.extract(6),
w.extract(0), w.extract(2), w.extract(4), w.extract(6))
}
fn shuf1357(v: f32x8, w: f32x8) -> f32x8 {
f32x8::new(v.extract(1), v.extract(3), v.extract(5), v.extract(7),
w.extract(1), w.extract(3), w.extract(5), w.extract(7),)
}
b.iter(|| {
for _ in 0..100 {
let x = bb(&x);
let x01 = x[0];
let x23 = x[1];
let a01 = shuf0246(x01, x23);
let a23 = shuf1357(x01, x23);
let b01 = shuf0246(a01, a23);
let b23 = shuf1357(a01, a23);
bb(&[b01, b23]);
}
})
}
#[cfg(target_feature = "avx")]
#[bench]
fn transpose_simd8_avx2_vpermps(b: &mut B) {
let x = [f32x8::splat(0_f32); 2];
// efficient on AVX2 using vpermps
fn perm04152637(v: f32x8) -> f32x8 {
// broken on rustc 1.7.0-nightly (1ddaf8bdf 2015-12-12)
// v.permutevar(i32x8::new(0, 4, 1, 5, 2, 6, 3, 7))
f32x8::new(v.extract(0), v.extract(4), v.extract(1), v.extract(5),
v.extract(2), v.extract(6), v.extract(3), v.extract(7))
}
fn shuf_lo(v: f32x8, w: f32x8) -> f32x8 {
f32x8::new(v.extract(0), v.extract(1), w.extract(0), w.extract(1),
v.extract(4), v.extract(5), w.extract(4), w.extract(5),)
}
fn shuf_hi(v: f32x8, w: f32x8) -> f32x8 {
f32x8::new(v.extract(2), v.extract(3), w.extract(2), w.extract(3),
v.extract(6), v.extract(7), w.extract(6), w.extract(7),)
}
b.iter(|| {
for _ in 0..100 {
let x = bb(&x);
let x01 = x[0];
let x23 = x[1];
let a01 = perm04152637(x01);
let a23 = perm04152637(x23);
let b01 = shuf_lo(a01, a23);
let b23 = shuf_hi(a01, a23);
bb(&[b01, b23]);
}
})
}
#[cfg(target_feature = "avx")]
#[bench]
fn transpose_simd8_avx2_vpermpd(b: &mut B) {
let x = [f32x8::splat(0_f32); 2];
// efficient on AVX2 using vpermpd
fn perm01452367(v: f32x8) -> f32x8 {
f32x8::new(v.extract(0), v.extract(1), v.extract(4), v.extract(5),
v.extract(2), v.extract(3), v.extract(6), v.extract(7))
}
fn shuf_lo_ps(v: f32x8, w: f32x8) -> f32x8 {
f32x8::new(v.extract(0), w.extract(0), v.extract(1), w.extract(1),
v.extract(4), w.extract(4), v.extract(5), w.extract(5),)
}
fn shuf_hi_ps(v: f32x8, w: f32x8) -> f32x8 {
f32x8::new(v.extract(2), w.extract(2), v.extract(3), w.extract(3),
v.extract(6), w.extract(6), v.extract(7), w.extract(7),)
}
b.iter(|| {
for _ in 0..100 {
let x = bb(&x);
let x01 = x[0];
let x23 = x[1];
let a01 = perm01452367(x01);
let a23 = perm01452367(x23);
let b01 = shuf_lo_ps(a01, a23);
let b23 = shuf_hi_ps(a01, a23);
bb(&[b01, b23]);
}
})
}

65
third_party/rust/simd/examples/axpy.rs vendored Executable file

@@ -0,0 +1,65 @@
#![feature(cfg_target_feature)]
extern crate simd;
use simd::f32x4;
#[cfg(target_feature = "avx")]
use simd::x86::avx::f32x8;
#[inline(never)]
pub fn axpy(z: &mut [f32], a: f32, x: &[f32], y: &[f32]) {
assert_eq!(x.len(), y.len());
assert_eq!(x.len(), z.len());
let len = std::cmp::min(std::cmp::min(x.len(), y.len()), z.len());
let mut i = 0;
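// len & !3 rounds len down to a multiple of 4, so only whole f32x4 chunks are processed; any tail elements stay untouched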
while i < len & !3 {
let x = f32x4::load(x, i);
let y = f32x4::load(y, i);
(f32x4::splat(a) * x + y).store(z, i);
i += 4
}
}
#[cfg(target_feature = "avx")]
#[inline(never)]
pub fn axpy8(z: &mut [f32], a: f32, x: &[f32], y: &[f32]) {
assert_eq!(x.len(), y.len());
assert_eq!(x.len(), z.len());
let len = std::cmp::min(std::cmp::min(x.len(), y.len()), z.len());
let mut i = 0;
while i < len & !7 {
let x = f32x8::load(x, i);
let y = f32x8::load(y, i);
(f32x8::splat(a) * x + y).store(z, i);
i += 8
}
}
#[cfg(not(target_feature = "avx"))]
pub fn axpy8(_: &mut [f32], _: f32, _: &[f32], _: &[f32]) {
unimplemented!()
}
fn main() {
let mut z = vec![0.; 4];
axpy(&mut z, 2., &[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]);
println!("{:?}", z);
let mut z = vec![0.; 8];
axpy(&mut z, 3., &[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
&[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]);
println!("{:?}", z);
if cfg!(target_feature = "avx") {
let mut z = vec![0.; 4];
axpy8(&mut z, 2., &[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]);
println!("{:?}", z);
let mut z = vec![0.; 8];
axpy8(&mut z, 3., &[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
&[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]);
println!("{:?}", z);
}
}

38
third_party/rust/simd/examples/convert.rs vendored Normal file

@@ -0,0 +1,38 @@
extern crate simd;
use simd::f32x4;
#[inline(never)]
pub fn convert_scalar(x: &mut [i32], y: &[f32]) {
assert_eq!(x.len(), y.len());
let mut i = 0;
while i < x.len() & !3 {
x[i] = y[i] as i32;
i += 1;
}
}
#[inline(never)]
pub fn convert(x: &mut [i32], y: &[f32]) {
assert_eq!(x.len(), y.len());
let mut i = 0;
while i < x.len() & !3 {
let v = f32x4::load(y, i);
v.to_i32().store(x, i);
i += 4
}
}
fn main() {
let x = &mut [0; 12];
let y = [1.0; 12];
convert(x, &y);
convert_scalar(x, &y);
println!("{:?}", x);
let x = &mut [0; 16];
let y = [1.0; 16];
convert(x, &y);
convert_scalar(x, &y);
println!("{:?}", x);
}

60
third_party/rust/simd/examples/dot-product.rs vendored Executable file

@@ -0,0 +1,60 @@
#![feature(cfg_target_feature)]
extern crate simd;
use simd::f32x4;
#[cfg(target_feature = "avx")]
use simd::x86::avx::{f32x8, LowHigh128};
#[inline(never)]
pub fn dot(x: &[f32], y: &[f32]) -> f32 {
assert_eq!(x.len(), y.len());
let len = std::cmp::min(x.len(), y.len());
let mut sum = f32x4::splat(0.0);
let mut i = 0;
while i < len & !3 {
let x = f32x4::load(x, i);
let y = f32x4::load(y, i);
sum = sum + x * y;
i += 4
}
sum.extract(0) + sum.extract(1) + sum.extract(2) + sum.extract(3)
}
#[cfg(target_feature = "avx")]
#[inline(never)]
pub fn dot8(x: &[f32], y: &[f32]) -> f32 {
assert_eq!(x.len(), y.len());
let len = std::cmp::min(x.len(), y.len());
let mut sum = f32x8::splat(0.0);
let mut i = 0;
while i < len & !7 {
let x = f32x8::load(x, i);
let y = f32x8::load(y, i);
sum = sum + x * y;
i += 8
}
let sum = sum.low() + sum.high();
sum.extract(0) + sum.extract(1) + sum.extract(2) + sum.extract(3)
}
#[cfg(not(target_feature = "avx"))]
pub fn dot8(_: &[f32], _: &[f32]) -> f32 {
unimplemented!()
}
fn main() {
println!("{}", dot(&[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]));
println!("{}", dot(&[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
&[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]));
if cfg!(target_feature = "avx") {
println!("{}", dot8(&[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]));
println!("{}", dot8(&[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
&[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]));
}
}

156
third_party/rust/simd/examples/fannkuch-redux-nosimd.rs vendored Normal file

@@ -0,0 +1,156 @@
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// contributed by the Rust Project Developers
// contributed by TeXitoi
use std::{cmp, mem};
use std::thread;
fn rotate(x: &mut [i32]) {
let mut prev = x[0];
for place in x.iter_mut().rev() {
prev = mem::replace(place, prev)
}
}
fn next_permutation(perm: &mut [i32], count: &mut [i32]) {
for i in 1..perm.len() {
rotate(&mut perm[.. i + 1]);
let count_i = &mut count[i];
if *count_i >= i as i32 {
*count_i = 0;
} else {
*count_i += 1;
break
}
}
}
#[derive(Clone, Copy)]
struct P {
p: [i32; 16],
}
#[derive(Clone, Copy)]
struct Perm {
cnt: [i32; 16],
fact: [u32; 16],
n: u32,
permcount: u32,
perm: P,
}
impl Perm {
fn new(n: u32) -> Perm {
let mut fact = [1; 16];
for i in 1 .. n as usize + 1 {
fact[i] = fact[i - 1] * i as u32;
}
Perm {
cnt: [0; 16],
fact: fact,
n: n,
permcount: 0,
perm: P { p: [0; 16 ] }
}
}
fn get(&mut self, mut idx: i32) -> P {
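// decode idx in the factorial number system: the digit at position i says how many times to rotate the first i+1 elements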
let mut pp = [0u8; 16];
self.permcount = idx as u32;
for (i, place) in self.perm.p.iter_mut().enumerate() {
*place = i as i32 + 1;
}
for i in (1 .. self.n as usize).rev() {
let d = idx / self.fact[i] as i32;
self.cnt[i] = d;
idx %= self.fact[i] as i32;
for (place, val) in pp.iter_mut().zip(self.perm.p[..(i+1)].iter()) {
*place = (*val) as u8
}
let d = d as usize;
for j in 0 .. i + 1 {
self.perm.p[j] = if j + d <= i {pp[j + d]} else {pp[j+d-i-1]} as i32;
}
}
self.perm
}
fn count(&self) -> u32 { self.permcount }
fn max(&self) -> u32 { self.fact[self.n as usize] }
fn next(&mut self) -> P {
next_permutation(&mut self.perm.p, &mut self.cnt);
self.permcount += 1;
self.perm
}
}
fn reverse(tperm: &mut [i32], k: usize) {
tperm[..k].reverse()
}
fn work(mut perm: Perm, n: usize, max: usize) -> (i32, i32) {
let mut checksum = 0;
let mut maxflips = 0;
let mut p = perm.get(n as i32);
while perm.count() < max as u32 {
let mut flips = 0;
while p.p[0] != 1 {
let k = p.p[0] as usize;
reverse(&mut p.p, k);
flips += 1;
}
checksum += if perm.count() % 2 == 0 {flips} else {-flips};
maxflips = cmp::max(maxflips, flips);
p = perm.next();
}
(checksum, maxflips)
}
fn fannkuch(n: i32) -> (i32, i32) {
let perm = Perm::new(n as u32);
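// note: the next binding shadows the argument n; the work is split into a single chunk (one worker thread) here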
let n = 1;
let mut futures = vec![];
let k = perm.max() / n;
for j in (0..).map(|x| x * k).take_while(|&j| j < k * n) {
let max = cmp::min(j+k, perm.max());
futures.push(thread::spawn(move|| {
work(perm, j as usize, max as usize)
}))
}
let mut checksum = 0;
let mut maxflips = 0;
for fut in futures.into_iter() {
let (cs, mf) = fut.join().unwrap();
checksum += cs;
maxflips = cmp::max(maxflips, mf);
}
(checksum, maxflips)
}
fn main() {
let n = std::env::args_os().nth(1)
.and_then(|s| s.into_string().ok())
.and_then(|n| n.parse().ok())
.unwrap_or(7);
let (checksum, maxflips) = fannkuch(n);
println!("{}\nPfannkuchen({}) = {}", checksum, n, maxflips);
}

233
third_party/rust/simd/examples/fannkuch-redux.rs vendored Executable file

@@ -0,0 +1,233 @@
#![feature(cfg_target_feature)]
extern crate simd;
#[macro_use] extern crate cfg_if;
use simd::u8x16;
use std::{env, process};
cfg_if! {
if #[cfg(target_arch = "aarch64")] {
#[inline(always)]
fn shuffle(x: u8x16, y: u8x16) -> u8x16 {
use simd::aarch64::neon::*;
y.table_lookup_1(x)
}
} else if #[cfg(all(target_arch = "arm",
target_feature = "neon"))] {
#[inline(always)]
fn shuffle(x: u8x16, y: u8x16) -> u8x16 {
use simd::arm::neon::*;
#[inline(always)]
fn split(x: u8x16) -> (u8x8, u8x8) {
unsafe {std::mem::transmute(x)}
}
fn join(x: u8x8, y: u8x8) -> u8x16 {
unsafe {std::mem::transmute((x, y))}
}
let (t0, t1) = split(x);
let (i0, i1) = split(y);
join(i0.table_lookup_2(t0, t1),
i1.table_lookup_2(t0, t1))
}
} else if #[cfg(target_feature = "ssse3")] {
#[inline(always)]
fn shuffle(x: u8x16, y: u8x16) -> u8x16 {
use simd::x86::ssse3::*;
x.shuffle_bytes(y)
}
} else {
// slow fallback, so tests work
#[inline(always)]
fn shuffle(x: u8x16, y: u8x16) -> u8x16 {
u8x16::new(x.extract(y.extract(0) as u32),
x.extract(y.extract(1) as u32),
x.extract(y.extract(2) as u32),
x.extract(y.extract(3) as u32),
x.extract(y.extract(4) as u32),
x.extract(y.extract(5) as u32),
x.extract(y.extract(6) as u32),
x.extract(y.extract(7) as u32),
x.extract(y.extract(8) as u32),
x.extract(y.extract(9) as u32),
x.extract(y.extract(10) as u32),
x.extract(y.extract(11) as u32),
x.extract(y.extract(12) as u32),
x.extract(y.extract(13) as u32),
x.extract(y.extract(14) as u32),
x.extract(y.extract(15) as u32))
}
}
}
struct State {
s: [u8; 16],
flip_masks: [u8x16; 16],
rotate_masks: [u8x16; 16],
maxflips: i32,
odd: u16,
checksum: i32,
}
impl State {
fn new() -> State {
State {
s: [0; 16],
flip_masks: [u8x16::splat(0); 16],
rotate_masks: [u8x16::splat(0); 16],
maxflips: 0,
odd: 0,
checksum: 0,
}
}
#[inline(never)]
fn rotate_sisd(&mut self, n: usize) {
let c = self.s[0];
for i in 1..(n + 1) {
self.s[i - 1] = self.s[i];
}
self.s[n] = c;
}
#[inline(never)]
fn popmasks(&mut self) {
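// precompute, for each prefix length i+1, the byte-shuffle masks that flip (reverse) or rotate that prefix of the permutation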
let mut mask = [0_u8; 16];
for i in 0..16 {
for j in 0..16 { mask[j] = j as u8; }
for x in 0..(i+1)/2 {
mask.swap(x, i - x);
}
self.flip_masks[i] = u8x16::load(&mask, 0);
for j in 0..16 { self.s[j] = j as u8; }
self.rotate_sisd(i);
self.rotate_masks[i] = self.load_s();
}
}
fn rotate(&mut self, n: usize) {
shuffle(self.load_s(), self.rotate_masks[n]).store(&mut self.s, 0)
}
fn load_s(&self) -> u8x16 {
u8x16::load(&self.s, 0)
}
#[inline(never)]
fn tk(&mut self, n: usize) {
#[derive(Copy, Clone, Debug)]
struct Perm {
perm: u8x16,
start: u8,
odd: u16
}
let mut perms = [Perm { perm: u8x16::splat(0), start: 0 , odd: 0 }; 60];
let mut i = 0;
let mut c = [0_u8; 16];
let mut perm_max = 0;
while i < n {
while i < n && perm_max < 60 {
self.rotate(i);
if c[i] as usize >= i {
c[i] = 0;
i += 1;
continue
}
c[i] += 1;
i = 1;
self.odd = !self.odd;
if self.s[0] != 0 {
if self.s[self.s[0] as usize] != 0 {
perms[perm_max].perm = self.load_s();
perms[perm_max].start = self.s[0];
perms[perm_max].odd = self.odd;
perm_max += 1;
} else {
if self.maxflips == 0 { self.maxflips = 1 }
self.checksum += if self.odd != 0 { -1 } else { 1 };
}
}
}
let mut k = 0;
while k < std::cmp::max(1, perm_max) - 1 {
let pk = &perms[k];
let pk1 = &perms[k + 1];
//println!("perm1 {:?}\nperm2 {:?}", pk.perm, pk1.perm);
let mut perm1 = pk.perm;
let mut perm2 = pk1.perm;
let mut f1 = 0;
let mut f2 = 0;
let mut toterm1 = pk.start;
let mut toterm2 = pk1.start;
while toterm1 != 0 && toterm2 != 0 {
perm1 = shuffle(perm1, self.flip_masks[toterm1 as usize]);
perm2 = shuffle(perm2, self.flip_masks[toterm2 as usize]);
toterm1 = perm1.extract(0);
toterm2 = perm2.extract(0);
f1 += 1; f2 += 1;
}
while toterm1 != 0 {
perm1 = shuffle(perm1, self.flip_masks[toterm1 as usize]);
toterm1 = perm1.extract(0);
f1 += 1;
}
while toterm2 != 0 {
perm2 = shuffle(perm2, self.flip_masks[toterm2 as usize]);
toterm2 = perm2.extract(0);
f2 += 1;
}
if f1 > self.maxflips { self.maxflips = f1 }
if f2 > self.maxflips { self.maxflips = f2 }
self.checksum += if pk.odd != 0 { -f1 } else { f1 };
self.checksum += if pk1.odd != 0 { -f2 } else { f2 };
k += 2;
}
while k < perm_max {
let pk = &perms[k];
let mut perm = pk.perm;
let mut f = 0;
let mut toterm = pk.start;
while toterm != 0 {
perm = shuffle(perm, self.flip_masks[toterm as usize]);
toterm = perm.extract(0);
f += 1;
}
if f > self.maxflips { self.maxflips = f }
self.checksum += if pk.odd != 0 { -f } else { f };
k += 1
}
perm_max = 0;
}
}
}
fn main() {
let mut state = State::new();
state.popmasks();
let args = env::args().collect::<Vec<_>>();
if args.len() < 2 {
println!("usage: {} number", args[0]);
process::exit(1)
}
let max_n = args[1].parse().unwrap();
if max_n < 3 || max_n > 15 {
println!("range: must be 3 <= n <= 14");
process::exit(1);
}
for i in 0..max_n { state.s[i] = i as u8 }
state.tk(max_n);
println!("{}\nPfannkuchen({}) = {}", state.checksum, max_n, state.maxflips);
}

125
third_party/rust/simd/examples/mandelbrot.rs vendored Executable file

@@ -0,0 +1,125 @@
#![feature(step_by, test)]
extern crate test;
extern crate simd;
use simd::{f32x4, u32x4};
use std::io::prelude::*;
#[inline(never)]
fn mandelbrot_naive(c_x: f32, c_y: f32, max_iter: u32) -> u32 {
let mut x = c_x;
let mut y = c_y;
let mut count = 0;
while count < max_iter {
let xy = x * y;
let xx = x * x;
let yy = y * y;
let sum = xx + yy;
if sum > 4.0 {
break
}
count += 1;
x = xx - yy + c_x;
y = xy * 2.0 + c_y;
}
count
}
#[inline(never)]
fn mandelbrot_vector(c_x: f32x4, c_y: f32x4, max_iter: u32) -> u32x4 {
let mut x = c_x;
let mut y = c_y;
let mut count = u32x4::splat(0);
for _ in 0..max_iter as usize {
let xy = x * y;
let xx = x * x;
let yy = y * y;
let sum = xx + yy;
let mask = sum.lt(f32x4::splat(4.0));
if !mask.any() { break }
count = count + mask.to_i().select(u32x4::splat(1),
u32x4::splat(0));
x = xx - yy + c_x;
y = xy + xy + c_y;
}
count
}
const COLOURS: &'static [(f32, f32, f32)] = &[(0.0, 7.0, 100.0),
(32.0, 107.0, 203.0),
(237.0, 255.0, 255.0),
(255.0, 170.0, 0.0),
(0.0, 2.0, 0.0)];
const SCALE: f32 = 12.0;
const LIMIT: u32 = 100;
#[inline(never)]
fn output_one(buf: &mut [u8], val: u32) {
let (r, g, b);
if val == LIMIT {
r = 0;
g = 0;
b = 0;
} else {
let val = (val as f32 % SCALE) * (COLOURS.len() as f32) / SCALE;
let left = val as usize % COLOURS.len();
let right = (left + 1) % COLOURS.len();
let p = val - left as f32;
let (r1, g1, b1) = COLOURS[left];
let (r2, g2, b2) = COLOURS[right];
r = (r1 + (r2 - r1) * p) as u8;
g = (g1 + (g2 - g1) * p) as u8;
b = (b1 + (b2 - b1) * p) as u8;
}
buf[0] = r;
buf[1] = g;
buf[2] = b;
}
fn main() {
let mut args = std::env::args();
args.next();
let width = args.next().unwrap().parse().unwrap();
let height = args.next().unwrap().parse().unwrap();
let left = -2.2;
let right = left + 3.0;
let top = 1.0;
let bottom = top - 2.0;
let width_step: f32 = (right - left) / width as f32;
let height_step: f32 = (bottom - top) / height as f32;
let adjust = f32x4::splat(width_step) * f32x4::new(0., 1., 2., 3.);
println!("P6 {} {} 255", width, height);
let mut line = vec![0; width * 3];
if args.next().is_none() {
for i in 0..height {
let y = f32x4::splat(top + height_step * i as f32);
for j in (0..width).step_by(4) {
let x = f32x4::splat(left + width_step * j as f32) + adjust;
let ret = mandelbrot_vector(x, y, LIMIT);
test::black_box(ret);
for k in 0..4 { let val = ret.extract(k as u32); output_one(&mut line[3*(j + k)..3*(j + k + 1)], val); }
}
::std::io::stdout().write(&line).unwrap();
}
} else {
for i in 0..height {
let y = top + height_step * i as f32;
for j in 0..width {
let x = left + width_step * j as f32;
let val = mandelbrot_naive(x, y, LIMIT);
test::black_box(val);
output_one(&mut line[3*j..3*(j + 1)], val);
}
::std::io::stdout().write(&line).unwrap();
}
}
}

280
third_party/rust/simd/examples/matrix-inverse.rs vendored Normal file

@@ -0,0 +1,280 @@
extern crate simd;
use simd::f32x4;
fn mul(x: &[f32x4; 4], y: &[f32x4; 4]) -> [f32x4; 4] {
let y0 = y[0];
let y1 = y[1];
let y2 = y[2];
let y3 = y[3];
[f32x4::splat(y0.extract(0)) * x[0] +
f32x4::splat(y0.extract(1)) * x[1] +
f32x4::splat(y0.extract(2)) * x[2] +
f32x4::splat(y0.extract(3)) * x[3],
f32x4::splat(y1.extract(0)) * x[0] +
f32x4::splat(y1.extract(1)) * x[1] +
f32x4::splat(y1.extract(2)) * x[2] +
f32x4::splat(y1.extract(3)) * x[3],
f32x4::splat(y2.extract(0)) * x[0] +
f32x4::splat(y2.extract(1)) * x[1] +
f32x4::splat(y2.extract(2)) * x[2] +
f32x4::splat(y2.extract(3)) * x[3],
f32x4::splat(y3.extract(0)) * x[0] +
f32x4::splat(y3.extract(1)) * x[1] +
f32x4::splat(y3.extract(2)) * x[2] +
f32x4::splat(y3.extract(3)) * x[3],
]
}
fn inverse_naive(x: &[[f32; 4]; 4]) -> [[f32; 4]; 4] {
let mut t = [[0_f32; 4]; 4];
for i in 0..4 {
t[0][i] = x[i][0];
t[1][i] = x[i][1];
t[2][i] = x[i][2];
t[3][i] = x[i][3];
}
println!("{:?}", t);
let _0 = t[2][2] * t[3][3];
let _1 = t[2][3] * t[3][2];
let _2 = t[2][1] * t[3][3];
let _3 = t[2][3] * t[3][1];
let _4 = t[2][1] * t[3][2];
let _5 = t[2][2] * t[3][1];
let _6 = t[2][0] * t[3][3];
let _7 = t[2][3] * t[3][0];
let _8 = t[2][0] * t[3][2];
let _9 = t[2][2] * t[3][0];
let _10 = t[2][0] * t[3][1];
let _11 = t[2][1] * t[3][0];
let v = [_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11];
println!("{:?}", v);
let d00 = _0 * t[1][1] + _3 * t[1][2] + _4 * t[1][3] -
(_1 * t[1][1] + _2 * t[1][2] + _5 * t[1][3]);
let d01 = _1 * t[1][0] + _6 * t[1][2] + _9 * t[1][3] -
(_0 * t[1][0] + _7 * t[1][2] + _8 * t[1][3]);
let d02 = _2 * t[1][0] + _7 * t[1][1] + _10 * t[1][3] -
(_3 * t[1][0] + _6 * t[1][1] + _11 * t[1][3]);
let d03 = _5 * t[1][0] + _8 * t[1][1] + _11 * t[1][2] -
(_4 * t[1][0] + _9 * t[1][1] + _10 * t[1][2]);
let d10 = _1 * t[0][1] + _2 * t[0][2] + _5 * t[0][3] -
(_0 * t[0][1] + _3 * t[0][2] + _4 * t[0][3]);
let d11 = _0 * t[0][0] + _7 * t[0][2] + _8 * t[0][3] -
(_1 * t[0][0] + _6 * t[0][2] + _9 * t[0][3]);
let d12 = _3 * t[0][0] + _6 * t[0][1] + _11 * t[0][3] -
(_2 * t[0][0] + _7 * t[0][1] + _10 * t[0][3]);
let d13 = _4 * t[0][0] + _9 * t[0][1] + _10 * t[0][2] -
(_5 * t[0][0] + _8 * t[0][1] + _11 * t[0][2]);
println!("{:?}", [d00, d01, d02, d03, d10, d11, d12, d13]);
let _0 = t[0][2] * t[1][3];
let _1 = t[0][3] * t[1][2];
let _2 = t[0][1] * t[1][3];
let _3 = t[0][3] * t[1][1];
let _4 = t[0][1] * t[1][2];
let _5 = t[0][2] * t[1][1];
let _6 = t[0][0] * t[1][3];
let _7 = t[0][3] * t[1][0];
let _8 = t[0][0] * t[1][2];
let _9 = t[0][2] * t[1][0];
let _10 = t[0][0] * t[1][1];
let _11 = t[0][1] * t[1][0];
let v = [_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11];
println!("{:?}", v);
let d20 = _0*t[3][1] + _3*t[3][2] + _4*t[3][3]-
(_1*t[3][1] + _2*t[3][2] + _5*t[3][3]);
let d21 = _1*t[3][0] + _6*t[3][2] + _9*t[3][3]-
(_0*t[3][0] + _7*t[3][2] + _8*t[3][3]);
let d22 = _2*t[3][0] + _7*t[3][1] + _10*t[3][3]-
(_3*t[3][0] + _6*t[3][1] + _11*t[3][3]);
let d23 = _5*t[3][0] + _8*t[3][1] + _11*t[3][2]-
(_4*t[3][0] + _9*t[3][1] + _10*t[3][2]);
let d30 = _2*t[2][2] + _5*t[2][3] + _1*t[2][1]-
(_4*t[2][3] + _0*t[2][1] + _3*t[2][2]);
let d31 = _8*t[2][3] + _0*t[2][0] + _7*t[2][2]-
(_6*t[2][2] + _9*t[2][3] + _1*t[2][0]);
let d32 = _6*t[2][1] + _11*t[2][3] + _3*t[2][0]-
(_10*t[2][3] + _2*t[2][0] + _7*t[2][1]);
let d33 = _10*t[2][2] + _4*t[2][0] + _9*t[2][1]-
(_8*t[2][1] + _11*t[2][2] + _5*t[2][0]);
println!("{:?}", [d20, d21, d22, d23, d30, d31, d32, d33]);
let det = t[0][0] * d00 + t[0][1] * d01 + t[0][2] * d02 + t[0][3] * d03;
let det = 1.0 / det;
let mut ret = [[d00, d01, d02, d03],
[d10, d11, d12, d13],
[d20, d21, d22, d23],
[d30, d31, d32, d33]];
for i in 0..4 {
for j in 0..4 {
ret[i][j] *= det;
}
}
ret
}
fn inverse_simd4(x: &[f32x4; 4]) -> [f32x4; 4] {
let src0 = x[0];
let src1 = x[1];
let src2 = x[2];
let src3 = x[3];
let tmp1 = f32x4::new(src0.extract(0), src0.extract(1),
src1.extract(4 - 4), src1.extract(5 - 4));
let row1 = f32x4::new(src2.extract(0), src2.extract(1),
src3.extract(4 - 4), src3.extract(5 - 4));
let row0 = f32x4::new(tmp1.extract(0), tmp1.extract(2),
row1.extract(4 - 4), row1.extract(6 - 4));
let row1 = f32x4::new(row1.extract(1), row1.extract(3),
tmp1.extract(5 - 4), tmp1.extract(7 - 4));
let tmp1 = f32x4::new(src0.extract(2), src0.extract(3),
src1.extract(6 - 4), src1.extract(7 - 4));
let row3 = f32x4::new(src2.extract(2), src2.extract(3),
src3.extract(6 - 4), src3.extract(7 - 4));
let row2 = f32x4::new(tmp1.extract(0), tmp1.extract(2),
row3.extract(4 - 4), row3.extract(6 - 4));
let row3 = f32x4::new(row3.extract(1), row3.extract(3),
tmp1.extract(5 - 4), tmp1.extract(7 - 4));
let tmp1 = row2 * row3;
let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
tmp1.extract(3), tmp1.extract(2));
let minor0 = row1 * tmp1;
let minor1 = row0 * tmp1;
let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
tmp1.extract(0), tmp1.extract(1));
let minor0 = (row1 * tmp1) - minor0;
let minor1 = (row0 * tmp1) - minor1;
let minor1 = f32x4::new(minor1.extract(2), minor1.extract(3),
minor1.extract(0), minor1.extract(1));
//println!("{:?}", minor1);
let tmp1 = row1 * row2;
let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
tmp1.extract(3), tmp1.extract(2));
let minor0 = (row3 * tmp1) + minor0;
let minor3 = row0 * tmp1;
let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
tmp1.extract(0), tmp1.extract(1));
let minor0 = minor0 - row3 * tmp1;
let minor3 = row0 * tmp1 - minor3;
let minor3 = f32x4::new(minor3.extract(2), minor3.extract(3),
minor3.extract(0), minor3.extract(1));
//println!("{:?}", minor1);
let tmp1 = row3 * f32x4::new(row1.extract(2), row1.extract(3),
row1.extract(0), row1.extract(1));
let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
tmp1.extract(3), tmp1.extract(2));
let row2 = f32x4::new(row2.extract(2), row2.extract(3),
row2.extract(0), row2.extract(1));
let minor0 = row2 * tmp1 + minor0;
let minor2 = row0 * tmp1;
let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
tmp1.extract(0), tmp1.extract(1));
let minor0 = minor0 - row2 * tmp1;
let minor2 = row0 * tmp1 - minor2;
let minor2 = f32x4::new(minor2.extract(2), minor2.extract(3),
minor2.extract(0), minor2.extract(1));
//println!("{:?}", minor1);
let tmp1 = row0 * row1;
let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
tmp1.extract(3), tmp1.extract(2));
let minor2 = minor2 + row3 * tmp1;
let minor3 = row2 * tmp1 - minor3;
let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
tmp1.extract(0), tmp1.extract(1));
let minor2 = row3 * tmp1 - minor2;
let minor3 = minor3 - row2 * tmp1;
//println!("{:?}", minor1);
let tmp1 = row0 * row3;
let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
tmp1.extract(3), tmp1.extract(2));
let minor1 = minor1 - row2 * tmp1;
let minor2 = row1 * tmp1 + minor2;
let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
tmp1.extract(0), tmp1.extract(1));
let minor1 = row2 * tmp1 + minor1;
let minor2 = minor2 - row1 * tmp1;
//println!("{:?}", minor1);
let tmp1 = row0 * row2;
let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
tmp1.extract(3), tmp1.extract(2));
let minor1 = row3 * tmp1 + minor1;
let minor3 = minor3 - row1 * tmp1;
let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
tmp1.extract(0), tmp1.extract(1));
let minor1 = minor1 - row3 * tmp1;
let minor3 = row1 * tmp1 + minor3;
//println!("{:?}", minor1);
let det = row0 * minor0;
let det = f32x4::new(det.extract(2), det.extract(3),
det.extract(0), det.extract(1)) + det;
let det = f32x4::new(det.extract(1), det.extract(0),
det.extract(3), det.extract(2)) + det;
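// one Newton-Raphson step below: r1 = 2*r0 - det*r0*r0 refines the approximate reciprocal of det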
let tmp1 = det.approx_reciprocal();
let det = tmp1 + tmp1 - det * tmp1 * tmp1;
// let det = f32x4::splat(det.extract(0));
[minor0 * det, minor1 * det, minor2 * det, minor3 * det]
}
fn p(x: &[f32x4; 4]) {
for xx in x {
for i in 0..4 {
let v = xx.extract(i);
if v == 0.0 {
print!("{}{:6.2}", if i > 0 {", "} else {"|"}, "");
} else {
print!("{}{:6.2}", if i > 0 {", "} else {"|"}, xx.extract(i));
}
}
println!(" |");
}
}
fn main() {
let x = [f32x4::new(-100.0, 6.0, 100.0, 1.0),
f32x4::new(3.0, 1.0, 0.0, 1.0),
f32x4::new(2.0, 1.0, 1.0, 1.0),
f32x4::new(-10.0, 1.0, 1.0, 1.0)];
/* let mut x_ = [[0.0; 4]; 4];
for i in 0..4 {
for j in 0..4 {
x_[i][j] = x[i].extract(j as u32)
}
}
let ret = inverse_naive(&x_);
let mut y = [f32x4::splat(0.0); 4];
for i in 0..4 {
for j in 0..4 {
y[i] = y[i].replace(j as u32, ret[i][j])
}
}*/
let y = inverse_simd4(&x);
p(&x);
println!("");
p(&y);
println!("");
p(&mul(&x, &y))
}

156
third_party/rust/simd/examples/nbody-nosimd.rs vendored Normal file

@@ -0,0 +1,156 @@
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// contributed by the Rust Project Developers
// contributed by TeXitoi
const PI: f64 = 3.141592653589793;
const SOLAR_MASS: f64 = 4.0 * PI * PI;
const YEAR: f64 = 365.24;
const N_BODIES: usize = 5;
static BODIES: [Planet;N_BODIES] = [
// Sun
Planet {
x: 0.0, y: 0.0, z: 0.0,
vx: 0.0, vy: 0.0, vz: 0.0,
mass: SOLAR_MASS,
},
// Jupiter
Planet {
x: 4.84143144246472090e+00,
y: -1.16032004402742839e+00,
z: -1.03622044471123109e-01,
vx: 1.66007664274403694e-03 * YEAR,
vy: 7.69901118419740425e-03 * YEAR,
vz: -6.90460016972063023e-05 * YEAR,
mass: 9.54791938424326609e-04 * SOLAR_MASS,
},
// Saturn
Planet {
x: 8.34336671824457987e+00,
y: 4.12479856412430479e+00,
z: -4.03523417114321381e-01,
vx: -2.76742510726862411e-03 * YEAR,
vy: 4.99852801234917238e-03 * YEAR,
vz: 2.30417297573763929e-05 * YEAR,
mass: 2.85885980666130812e-04 * SOLAR_MASS,
},
// Uranus
Planet {
x: 1.28943695621391310e+01,
y: -1.51111514016986312e+01,
z: -2.23307578892655734e-01,
vx: 2.96460137564761618e-03 * YEAR,
vy: 2.37847173959480950e-03 * YEAR,
vz: -2.96589568540237556e-05 * YEAR,
mass: 4.36624404335156298e-05 * SOLAR_MASS,
},
// Neptune
Planet {
x: 1.53796971148509165e+01,
y: -2.59193146099879641e+01,
z: 1.79258772950371181e-01,
vx: 2.68067772490389322e-03 * YEAR,
vy: 1.62824170038242295e-03 * YEAR,
vz: -9.51592254519715870e-05 * YEAR,
mass: 5.15138902046611451e-05 * SOLAR_MASS,
},
];
#[derive(Clone, Copy)]
struct Planet {
x: f64, y: f64, z: f64,
vx: f64, vy: f64, vz: f64,
mass: f64,
}
fn advance(bodies: &mut [Planet;N_BODIES], dt: f64, steps: i32) {
for _ in 0..steps {
let mut b_slice: &mut [_] = bodies;
loop {
let bi = match shift_mut_ref(&mut b_slice) {
Some(bi) => bi,
None => break
};
for bj in b_slice.iter_mut() {
let dx = bi.x - bj.x;
let dy = bi.y - bj.y;
let dz = bi.z - bj.z;
let d2 = dx * dx + dy * dy + dz * dz;
let mag = dt / (d2 * d2.sqrt());
let massj_mag = bj.mass * mag;
bi.vx -= dx * massj_mag;
bi.vy -= dy * massj_mag;
bi.vz -= dz * massj_mag;
let massi_mag = bi.mass * mag;
bj.vx += dx * massi_mag;
bj.vy += dy * massi_mag;
bj.vz += dz * massi_mag;
}
bi.x += dt * bi.vx;
bi.y += dt * bi.vy;
bi.z += dt * bi.vz;
}
}
}
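// Total energy = kinetic terms (m*v^2 / 2) minus pairwise potential terms
// (m_i * m_j / r_ij); it is conserved up to integration error, so it is
// printed before and after advancing as the benchmark's checksum.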
fn energy(bodies: &[Planet;N_BODIES]) -> f64 {
let mut e = 0.0;
let mut bodies = bodies.iter();
loop {
let bi = match bodies.next() {
Some(bi) => bi,
None => break
};
e += (bi.vx * bi.vx + bi.vy * bi.vy + bi.vz * bi.vz) * bi.mass / 2.0;
for bj in bodies.clone() {
let dx = bi.x - bj.x;
let dy = bi.y - bj.y;
let dz = bi.z - bj.z;
let dist = (dx * dx + dy * dy + dz * dz).sqrt();
e -= bi.mass * bj.mass / dist;
}
}
e
}
fn offset_momentum(bodies: &mut [Planet;N_BODIES]) {
let mut px = 0.0;
let mut py = 0.0;
let mut pz = 0.0;
for bi in bodies.iter() {
px += bi.vx * bi.mass;
py += bi.vy * bi.mass;
pz += bi.vz * bi.mass;
}
let sun = &mut bodies[0];
sun.vx = - px / SOLAR_MASS;
sun.vy = - py / SOLAR_MASS;
sun.vz = - pz / SOLAR_MASS;
}
fn main() {
let n = std::env::args().nth(1).expect("need one arg").parse().unwrap();
let mut bodies = BODIES;
offset_momentum(&mut bodies);
println!("{:.9}", energy(&bodies));
advance(&mut bodies, 0.01, n);
println!("{:.9}", energy(&bodies));
}
/// Pop a mutable reference off the head of a slice, mutating the slice to no
/// longer contain the mutable reference.
fn shift_mut_ref<'a, T>(r: &mut &'a mut [T]) -> Option<&'a mut T> {
if r.len() == 0 { return None }
let tmp = std::mem::replace(r, &mut []);
let (h, t) = tmp.split_at_mut(1);
*r = t;
Some(&mut h[0])
}
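// A minimal usage sketch (not part of the benchmark source): `shift_mut_ref`
// lets a loop hold `&mut` to the head element while keeping a `&mut` slice to
// the tail, which is what `advance` relies on above.
//
//     let mut data = [1, 2, 3];
//     let mut rest: &mut [i32] = &mut data;
//     while let Some(head) = shift_mut_ref(&mut rest) {
//         *head *= 10; // mutate the head; `rest` now borrows only the tail
//     }
//     assert_eq!(data, [10, 20, 30]);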

170
third_party/rust/simd/examples/nbody.rs vendored Executable file
@@ -0,0 +1,170 @@
#![feature(cfg_target_feature)]
extern crate simd;
#[cfg(target_feature = "sse2")]
use simd::x86::sse2::*;
#[cfg(target_arch = "aarch64")]
use simd::aarch64::neon::*;
const PI: f64 = 3.141592653589793;
const SOLAR_MASS: f64 = 4.0 * PI * PI;
const DAYS_PER_YEAR: f64 = 365.24;
struct Body {
x: [f64; 3],
_fill: f64,
v: [f64; 3],
mass: f64,
}
impl Body {
fn new(x0: f64, x1: f64, x2: f64,
v0: f64, v1: f64, v2: f64,
mass: f64) -> Body {
Body {
x: [x0, x1, x2],
_fill: 0.0,
v: [v0, v1, v2],
mass: mass,
}
}
}
const N_BODIES: usize = 5;
const N: usize = N_BODIES * (N_BODIES - 1) / 2;
fn offset_momentum(bodies: &mut [Body; N_BODIES]) {
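// Give the sun whatever velocity cancels the system's net momentum, so the
// centre of mass stays fixed during the simulation.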
let (sun, rest) = bodies.split_at_mut(1);
let sun = &mut sun[0];
for body in rest {
for k in 0..3 {
sun.v[k] -= body.v[k] * body.mass / SOLAR_MASS;
}
}
}
fn advance(bodies: &mut [Body; N_BODIES], dt: f64) {
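// r[i] caches the displacement vector of the i-th unordered body pair; the
// pair magnitudes are then computed two at a time in the f64x2 lanes below.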
let mut r = [[0.0; 4]; N];
let mut mag = [0.0; N];
let mut dx = [f64x2::splat(0.0); 3];
let mut dsquared;
let mut distance;
let mut dmag;
let mut i = 0;
for j in 0..N_BODIES {
for k in j+1..N_BODIES {
for m in 0..3 {
r[i][m] = bodies[j].x[m] - bodies[k].x[m];
}
i += 1;
}
}
i = 0;
while i < N {
for m in 0..3 {
dx[m] = f64x2::new(r[i][m], r[i+1][m]);
}
dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
distance = dsquared.to_f32().approx_rsqrt().to_f64();
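// The f32 rsqrt estimate is cheap but coarse; each Newton-Raphson pass below
// computes x' = x*(1.5 - 0.5*d*x*x) to refine 1/sqrt(dsquared) in f64.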
for _ in 0..2 {
distance = distance * f64x2::splat(1.5) -
((f64x2::splat(0.5) * dsquared) * distance) * (distance * distance)
}
dmag = f64x2::splat(dt) / dsquared * distance;
dmag.store(&mut mag, i);
i += 2;
}
i = 0;
for j in 0..N_BODIES {
for k in j+1..N_BODIES {
for m in 0..3 {
bodies[j].v[m] -= r[i][m] * bodies[k].mass * mag[i];
bodies[k].v[m] += r[i][m] * bodies[j].mass * mag[i];
}
i += 1
}
}
for body in bodies {
for m in 0..3 {
body.x[m] += dt * body.v[m]
}
}
}
fn energy(bodies: &[Body; N_BODIES]) -> f64 {
let mut e = 0.0;
for i in 0..N_BODIES {
let bi = &bodies[i];
e += bi.mass * (bi.v[0] * bi.v[0] + bi.v[1] * bi.v[1] + bi.v[2] * bi.v[2]) / 2.0;
for j in i+1..N_BODIES {
let bj = &bodies[j];
let mut dx = [0.0; 3];
for k in 0..3 {
dx[k] = bi.x[k] - bj.x[k];
}
let mut distance = 0.0;
for &d in &dx { distance += d * d }
e -= bi.mass * bj.mass / distance.sqrt()
}
}
e
}
fn main() {
let mut bodies: [Body; N_BODIES] = [
/* sun */
Body::new(0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
SOLAR_MASS),
/* jupiter */
Body::new(4.84143144246472090e+00,
-1.16032004402742839e+00,
-1.03622044471123109e-01 ,
1.66007664274403694e-03 * DAYS_PER_YEAR,
7.69901118419740425e-03 * DAYS_PER_YEAR,
-6.90460016972063023e-05 * DAYS_PER_YEAR ,
9.54791938424326609e-04 * SOLAR_MASS
),
/* saturn */
Body::new(8.34336671824457987e+00,
4.12479856412430479e+00,
-4.03523417114321381e-01 ,
-2.76742510726862411e-03 * DAYS_PER_YEAR,
4.99852801234917238e-03 * DAYS_PER_YEAR,
2.30417297573763929e-05 * DAYS_PER_YEAR ,
2.85885980666130812e-04 * SOLAR_MASS
),
/* uranus */
Body::new(1.28943695621391310e+01,
-1.51111514016986312e+01,
-2.23307578892655734e-01 ,
2.96460137564761618e-03 * DAYS_PER_YEAR,
2.37847173959480950e-03 * DAYS_PER_YEAR,
-2.96589568540237556e-05 * DAYS_PER_YEAR ,
4.36624404335156298e-05 * SOLAR_MASS
),
/* neptune */
Body::new(1.53796971148509165e+01,
-2.59193146099879641e+01,
1.79258772950371181e-01 ,
2.68067772490389322e-03 * DAYS_PER_YEAR,
1.62824170038242295e-03 * DAYS_PER_YEAR,
-9.51592254519715870e-05 * DAYS_PER_YEAR ,
5.15138902046611451e-05 * SOLAR_MASS
)
];
let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap();
offset_momentum(&mut bodies);
println!("{:.9}", energy(&bodies));
for _ in 0..n {
advance(&mut bodies, 0.01);
}
println!("{:.9}", energy(&bodies));
}

9
third_party/rust/simd/examples/ops.rs vendored Normal file
@@ -0,0 +1,9 @@
extern crate simd;
use simd::*;
fn main() {
let x = i32x4::splat(1_i32);
let y = -x; // lane-wise negation
let z = !x; // lane-wise bitwise complement
let _ = (y, z); // keep the results so the demo compiles without warnings
}

106
third_party/rust/simd/examples/spectral-norm-nosimd.rs vendored Normal file
@@ -0,0 +1,106 @@
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// contributed by the Rust Project Developers
// contributed by TeXitoi
#![allow(non_snake_case)]
use std::iter::repeat;
//use std::thread;
// As std::simd::f64x2 is unstable, we provide a similar interface,
// expecting llvm to autovectorize its usage.
#[allow(non_camel_case_types)]
struct f64x2(f64, f64);
impl std::ops::Add for f64x2 {
type Output = Self;
fn add(self, rhs: Self) -> Self {
f64x2(self.0 + rhs.0, self.1 + rhs.1)
}
}
impl std::ops::Div for f64x2 {
type Output = Self;
fn div(self, rhs: Self) -> Self {
f64x2(self.0 / rhs.0, self.1 / rhs.1)
}
}
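// Only Add and Div are implemented: that is all `mult` below needs.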
fn main() {
let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap();
let answer = spectralnorm(n);
println!("{:.9}", answer);
}
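// Power iteration: repeatedly apply A^T.A to u, then estimate the largest
// singular value of A as sqrt((u.v) / (v.v)).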
fn spectralnorm(n: usize) -> f64 {
assert!(n % 2 == 0, "only even lengths are accepted");
let mut u = repeat(1.0).take(n).collect::<Vec<_>>();
let mut v = u.clone();
let mut tmp = v.clone();
for _ in 0..10 {
mult_AtAv(&u, &mut v, &mut tmp);
mult_AtAv(&v, &mut u, &mut tmp);
}
(dot(&u, &v) / dot(&v, &v)).sqrt()
}
fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) {
mult_Av(v, tmp);
mult_Atv(tmp, out);
}
fn mult_Av(v: &[f64], out: &mut [f64]) {
parallel(out, |start, out| mult(v, out, start, |i, j| A(i, j)));
}
fn mult_Atv(v: &[f64], out: &mut [f64]) {
parallel(out, |start, out| mult(v, out, start, |i, j| A(j, i)));
}
fn mult<F>(v: &[f64], out: &mut [f64], start: usize, a: F)
where F: Fn(usize, usize) -> f64 {
for (i, slot) in out.iter_mut().enumerate().map(|(i, s)| (i + start, s)) {
let mut sum = f64x2(0.0, 0.0);
for (j, chunk) in v.chunks(2).enumerate().map(|(j, s)| (2 * j, s)) {
let top = f64x2(chunk[0], chunk[1]);
let bot = f64x2(a(i, j), a(i, j + 1));
sum = sum + top / bot;
}
let f64x2(a, b) = sum;
*slot = a + b;
}
}
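// The (i, j) entry of the infinite matrix is 1/((i+j)(i+j+1)/2 + i + 1);
// A returns the denominator, and callers divide by it.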
fn A(i: usize, j: usize) -> f64 {
((i + j) * (i + j + 1) / 2 + i + 1) as f64
}
fn dot(v: &[f64], u: &[f64]) -> f64 {
v.iter().zip(u.iter()).map(|(a, b)| *a * *b).fold(0., |acc, i| acc + i)
}
//struct Racy<T>(T);
//unsafe impl<T: 'static> Send for Racy<T> {}
// Executes a closure in parallel over the given mutable slice. The closure `f`
// is run in parallel and yielded the starting index within `v` as well as a
// sub-slice of `v`.
fn parallel<'a, T, F>(v: &mut [T], ref f: F)
where T: 'static + Send + Sync,
F: Fn(usize, &mut [T]) + Sync
{
f(0, v);
/*let size = v.len() / 4 + 1;
let jhs = v.chunks_mut(size).enumerate().map(|(i, chunk)| {
// Need to convert `f` and `chunk` to something that can cross the task
// boundary.
let f = Racy(f as *const F as *const usize);
let raw = Racy((&mut chunk[0] as *mut T, chunk.len()));
thread::spawn(move|| {
let f = f.0 as *const F;
let raw = raw.0;
unsafe { (*f)(i * size, std::slice::from_raw_parts_mut(raw.0, raw.1)) }
})
}).collect::<Vec<_>>();
for jh in jhs { jh.join().unwrap(); }*/
}

74
third_party/rust/simd/examples/spectral-norm.rs vendored Executable file
@@ -0,0 +1,74 @@
#![feature(cfg_target_feature)]
#![allow(non_snake_case)]
extern crate simd;
#[cfg(target_feature = "sse2")]
use simd::x86::sse2::f64x2;
#[cfg(target_arch = "aarch64")]
use simd::aarch64::neon::f64x2;
fn A(i: usize, j: usize) -> f64 {
((i + j) * (i + j + 1) / 2 + i + 1) as f64
}
fn dot(x: &[f64], y: &[f64]) -> f64 {
x.iter().zip(y).map(|(&x, &y)| x * y).fold(0.0, |a, b| a + b)
}
fn mult_Av(v: &[f64], out: &mut [f64]) {
assert!(v.len() == out.len());
assert!(v.len() % 2 == 0);
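// Two columns per iteration: load v[j..j+2] and divide by both matrix
// denominators at once, accumulating the row dot product in two lanes.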
for i in 0..v.len() {
let mut sum = f64x2::splat(0.0);
let mut j = 0;
while j < v.len() {
let b = f64x2::load(v, j);
let a = f64x2::new(A(i, j), A(i, j + 1));
sum = sum + b / a;
j += 2
}
out[i] = sum.extract(0) + sum.extract(1);
}
}
fn mult_Atv(v: &[f64], out: &mut [f64]) {
assert!(v.len() == out.len());
assert!(v.len() % 2 == 0);
for i in 0..v.len() {
let mut sum = f64x2::splat(0.0);
let mut j = 0;
while j < v.len() {
let b = f64x2::load(v, j);
let a = f64x2::new(A(j, i), A(j + 1, i));
sum = sum + b / a;
j += 2
}
out[i] = sum.extract(0) + sum.extract(1);
}
}
fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) {
mult_Av(v, tmp);
mult_Atv(tmp, out);
}
fn main() {
let mut n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap();
if n % 2 == 1 { n += 1 }
let mut u = vec![1.0; n];
let mut v = u.clone();
let mut tmp = u.clone();
for _ in 0..10 {
mult_AtAv(&u, &mut v, &mut tmp);
mult_AtAv(&v, &mut u, &mut tmp);
}
println!("{:.9}", (dot(&u, &v) / dot(&v, &v)).sqrt());
}

3
third_party/rust/simd/src/aarch64/mod.rs vendored Normal file
@@ -0,0 +1,3 @@
//! Features specific to AArch64 CPUs.
pub mod neon;

681
third_party/rust/simd/src/aarch64/neon.rs vendored Normal file
@@ -0,0 +1,681 @@
use super::super::*;
use {simd_cast, f32x2};
pub use sixty_four::{f64x2, i64x2, u64x2, bool64ix2, bool64fx2};
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u32x2(u32, u32);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i32x2(i32, i32);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u16x4(u16, u16, u16, u16);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i16x4(i16, i16, i16, i16);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u8x8(u8, u8, u8, u8,
u8, u8, u8, u8);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i8x8(i8, i8, i8, i8,
i8, i8, i8, i8);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i64x1(i64);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u64x1(u64);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct f64x1(f64);
#[allow(dead_code)]
extern "platform-intrinsic" {
fn aarch64_vhadd_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vhadd_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vhadd_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vhadd_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vhadd_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vhadd_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vhaddq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vhaddq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vhaddq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vhaddq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vhaddq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vhaddq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vrhadd_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vrhadd_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vrhadd_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vrhadd_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vrhadd_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vrhadd_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vrhaddq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vrhaddq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vrhaddq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vrhaddq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vrhaddq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vrhaddq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vqadd_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vqadd_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vqadd_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vqadd_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vqadd_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vqadd_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vqadd_s64(x: i64x1, y: i64x1) -> i64x1;
fn aarch64_vqadd_u64(x: u64x1, y: u64x1) -> u64x1;
fn aarch64_vqaddq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vqaddq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vqaddq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vqaddq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vqaddq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vqaddq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vqaddq_s64(x: i64x2, y: i64x2) -> i64x2;
fn aarch64_vqaddq_u64(x: u64x2, y: u64x2) -> u64x2;
fn aarch64_vuqadd_s8(x: i8x16, y: u8x16) -> i8x16;
fn aarch64_vuqadd_s16(x: i16x8, y: u16x8) -> i16x8;
fn aarch64_vuqadd_s32(x: i32x4, y: u32x4) -> i32x4;
fn aarch64_vuqadd_s64(x: i64x2, y: u64x2) -> i64x2;
fn aarch64_vsqadd_u8(x: u8x16, y: i8x16) -> u8x16;
fn aarch64_vsqadd_u16(x: u16x8, y: i16x8) -> u16x8;
fn aarch64_vsqadd_u32(x: u32x4, y: i32x4) -> u32x4;
fn aarch64_vsqadd_u64(x: u64x2, y: i64x2) -> u64x2;
fn aarch64_vraddhn_s16(x: i16x8, y: i16x8) -> i8x8;
fn aarch64_vraddhn_u16(x: u16x8, y: u16x8) -> u8x8;
fn aarch64_vraddhn_s32(x: i32x4, y: i32x4) -> i16x4;
fn aarch64_vraddhn_u32(x: u32x4, y: u32x4) -> u16x4;
fn aarch64_vraddhn_s64(x: i64x2, y: i64x2) -> i32x2;
fn aarch64_vraddhn_u64(x: u64x2, y: u64x2) -> u32x2;
fn aarch64_vfmulx_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vfmulx_f64(x: f64x1, y: f64x1) -> f64x1;
fn aarch64_vfmulxq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vfmulxq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vfma_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vfma_f64(x: f64x1, y: f64x1) -> f64x1;
fn aarch64_vfmaq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vfmaq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vqdmulh_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vqdmulh_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vqdmulhq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vqdmulhq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vqrdmulh_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vqrdmulh_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vqrdmulhq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vqrdmulhq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vmull_s8(x: i8x8, y: i8x8) -> i16x8;
fn aarch64_vmull_u8(x: u8x8, y: u8x8) -> u16x8;
fn aarch64_vmull_s16(x: i16x4, y: i16x4) -> i32x4;
fn aarch64_vmull_u16(x: u16x4, y: u16x4) -> u32x4;
fn aarch64_vmull_s32(x: i32x2, y: i32x2) -> i64x2;
fn aarch64_vmull_u32(x: u32x2, y: u32x2) -> u64x2;
fn aarch64_vqdmullq_s8(x: i8x8, y: i8x8) -> i16x8;
fn aarch64_vqdmullq_s16(x: i16x4, y: i16x4) -> i32x4;
fn aarch64_vhsub_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vhsub_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vhsub_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vhsub_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vhsub_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vhsub_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vhsubq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vhsubq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vhsubq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vhsubq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vhsubq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vhsubq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vqsub_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vqsub_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vqsub_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vqsub_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vqsub_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vqsub_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vqsub_s64(x: i64x1, y: i64x1) -> i64x1;
fn aarch64_vqsub_u64(x: u64x1, y: u64x1) -> u64x1;
fn aarch64_vqsubq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vqsubq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vqsubq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vqsubq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vqsubq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vqsubq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vqsubq_s64(x: i64x2, y: i64x2) -> i64x2;
fn aarch64_vqsubq_u64(x: u64x2, y: u64x2) -> u64x2;
fn aarch64_vrsubhn_s16(x: i16x8, y: i16x8) -> i8x8;
fn aarch64_vrsubhn_u16(x: u16x8, y: u16x8) -> u8x8;
fn aarch64_vrsubhn_s32(x: i32x4, y: i32x4) -> i16x4;
fn aarch64_vrsubhn_u32(x: u32x4, y: u32x4) -> u16x4;
fn aarch64_vrsubhn_s64(x: i64x2, y: i64x2) -> i32x2;
fn aarch64_vrsubhn_u64(x: u64x2, y: u64x2) -> u32x2;
fn aarch64_vabd_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vabd_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vabd_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vabd_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vabd_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vabd_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vabd_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vabd_f64(x: f64x1, y: f64x1) -> f64x1;
fn aarch64_vabdq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vabdq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vabdq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vabdq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vabdq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vabdq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vabdq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vabdq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vmax_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vmax_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vmax_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vmax_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vmax_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vmax_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vmax_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vmax_f64(x: f64x1, y: f64x1) -> f64x1;
fn aarch64_vmaxq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vmaxq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vmaxq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vmaxq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vmaxq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vmaxq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vmaxq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vmaxq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vmin_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vmin_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vmin_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vmin_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vmin_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vmin_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vmin_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vmin_f64(x: f64x1, y: f64x1) -> f64x1;
fn aarch64_vminq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vminq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vminq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vminq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vminq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vminq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vminq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vminq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vmaxnm_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vmaxnm_f64(x: f64x1, y: f64x1) -> f64x1;
fn aarch64_vmaxnmq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vmaxnmq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vminnm_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vminnm_f64(x: f64x1, y: f64x1) -> f64x1;
fn aarch64_vminnmq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vminnmq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vshl_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vshl_u8(x: u8x8, y: i8x8) -> u8x8;
fn aarch64_vshl_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vshl_u16(x: u16x4, y: i16x4) -> u16x4;
fn aarch64_vshl_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vshl_u32(x: u32x2, y: i32x2) -> u32x2;
fn aarch64_vshl_s64(x: i64x1, y: i64x1) -> i64x1;
fn aarch64_vshl_u64(x: u64x1, y: i64x1) -> u64x1;
fn aarch64_vshlq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vshlq_u8(x: u8x16, y: i8x16) -> u8x16;
fn aarch64_vshlq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vshlq_u16(x: u16x8, y: i16x8) -> u16x8;
fn aarch64_vshlq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vshlq_u32(x: u32x4, y: i32x4) -> u32x4;
fn aarch64_vshlq_s64(x: i64x2, y: i64x2) -> i64x2;
fn aarch64_vshlq_u64(x: u64x2, y: i64x2) -> u64x2;
fn aarch64_vqshl_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vqshl_u8(x: u8x8, y: i8x8) -> u8x8;
fn aarch64_vqshl_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vqshl_u16(x: u16x4, y: i16x4) -> u16x4;
fn aarch64_vqshl_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vqshl_u32(x: u32x2, y: i32x2) -> u32x2;
fn aarch64_vqshl_s64(x: i64x1, y: i64x1) -> i64x1;
fn aarch64_vqshl_u64(x: u64x1, y: i64x1) -> u64x1;
fn aarch64_vqshlq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vqshlq_u8(x: u8x16, y: i8x16) -> u8x16;
fn aarch64_vqshlq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vqshlq_u16(x: u16x8, y: i16x8) -> u16x8;
fn aarch64_vqshlq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vqshlq_u32(x: u32x4, y: i32x4) -> u32x4;
fn aarch64_vqshlq_s64(x: i64x2, y: i64x2) -> i64x2;
fn aarch64_vqshlq_u64(x: u64x2, y: i64x2) -> u64x2;
fn aarch64_vrshl_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vrshl_u8(x: u8x8, y: i8x8) -> u8x8;
fn aarch64_vrshl_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vrshl_u16(x: u16x4, y: i16x4) -> u16x4;
fn aarch64_vrshl_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vrshl_u32(x: u32x2, y: i32x2) -> u32x2;
fn aarch64_vrshl_s64(x: i64x1, y: i64x1) -> i64x1;
fn aarch64_vrshl_u64(x: u64x1, y: i64x1) -> u64x1;
fn aarch64_vrshlq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vrshlq_u8(x: u8x16, y: i8x16) -> u8x16;
fn aarch64_vrshlq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vrshlq_u16(x: u16x8, y: i16x8) -> u16x8;
fn aarch64_vrshlq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vrshlq_u32(x: u32x4, y: i32x4) -> u32x4;
fn aarch64_vrshlq_s64(x: i64x2, y: i64x2) -> i64x2;
fn aarch64_vrshlq_u64(x: u64x2, y: i64x2) -> u64x2;
fn aarch64_vqrshl_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vqrshl_u8(x: u8x8, y: i8x8) -> u8x8;
fn aarch64_vqrshl_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vqrshl_u16(x: u16x4, y: i16x4) -> u16x4;
fn aarch64_vqrshl_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vqrshl_u32(x: u32x2, y: i32x2) -> u32x2;
fn aarch64_vqrshl_s64(x: i64x1, y: i64x1) -> i64x1;
fn aarch64_vqrshl_u64(x: u64x1, y: i64x1) -> u64x1;
fn aarch64_vqrshlq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vqrshlq_u8(x: u8x16, y: i8x16) -> u8x16;
fn aarch64_vqrshlq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vqrshlq_u16(x: u16x8, y: i16x8) -> u16x8;
fn aarch64_vqrshlq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vqrshlq_u32(x: u32x4, y: i32x4) -> u32x4;
fn aarch64_vqrshlq_s64(x: i64x2, y: i64x2) -> i64x2;
fn aarch64_vqrshlq_u64(x: u64x2, y: i64x2) -> u64x2;
fn aarch64_vqshrun_n_s16(x: i16x8, y: u32) -> i8x8;
fn aarch64_vqshrun_n_s32(x: i32x4, y: u32) -> i16x4;
fn aarch64_vqshrun_n_s64(x: i64x2, y: u32) -> i32x2;
fn aarch64_vqrshrun_n_s16(x: i16x8, y: u32) -> i8x8;
fn aarch64_vqrshrun_n_s32(x: i32x4, y: u32) -> i16x4;
fn aarch64_vqrshrun_n_s64(x: i64x2, y: u32) -> i32x2;
fn aarch64_vqshrn_n_s16(x: i16x8, y: u32) -> i8x8;
fn aarch64_vqshrn_n_u16(x: u16x8, y: u32) -> u8x8;
fn aarch64_vqshrn_n_s32(x: i32x4, y: u32) -> i16x4;
fn aarch64_vqshrn_n_u32(x: u32x4, y: u32) -> u16x4;
fn aarch64_vqshrn_n_s64(x: i64x2, y: u32) -> i32x2;
fn aarch64_vqshrn_n_u64(x: u64x2, y: u32) -> u32x2;
fn aarch64_vrshrn_n_s16(x: i16x8, y: u32) -> i8x8;
fn aarch64_vrshrn_n_u16(x: u16x8, y: u32) -> u8x8;
fn aarch64_vrshrn_n_s32(x: i32x4, y: u32) -> i16x4;
fn aarch64_vrshrn_n_u32(x: u32x4, y: u32) -> u16x4;
fn aarch64_vrshrn_n_s64(x: i64x2, y: u32) -> i32x2;
fn aarch64_vrshrn_n_u64(x: u64x2, y: u32) -> u32x2;
fn aarch64_vqrshrn_n_s16(x: i16x8, y: u32) -> i8x8;
fn aarch64_vqrshrn_n_u16(x: u16x8, y: u32) -> u8x8;
fn aarch64_vqrshrn_n_s32(x: i32x4, y: u32) -> i16x4;
fn aarch64_vqrshrn_n_u32(x: u32x4, y: u32) -> u16x4;
fn aarch64_vqrshrn_n_s64(x: i64x2, y: u32) -> i32x2;
fn aarch64_vqrshrn_n_u64(x: u64x2, y: u32) -> u32x2;
fn aarch64_vsri_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vsri_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vsri_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vsri_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vsri_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vsri_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vsri_s64(x: i64x1, y: i64x1) -> i64x1;
fn aarch64_vsri_u64(x: u64x1, y: u64x1) -> u64x1;
fn aarch64_vsriq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vsriq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vsriq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vsriq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vsriq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vsriq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vsriq_s64(x: i64x2, y: i64x2) -> i64x2;
fn aarch64_vsriq_u64(x: u64x2, y: u64x2) -> u64x2;
fn aarch64_vsli_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vsli_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vsli_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vsli_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vsli_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vsli_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vsli_s64(x: i64x1, y: i64x1) -> i64x1;
fn aarch64_vsli_u64(x: u64x1, y: u64x1) -> u64x1;
fn aarch64_vsliq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vsliq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vsliq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vsliq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vsliq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vsliq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vsliq_s64(x: i64x2, y: i64x2) -> i64x2;
fn aarch64_vsliq_u64(x: u64x2, y: u64x2) -> u64x2;
fn aarch64_vvqmovn_s16(x: i16x8) -> i8x8;
fn aarch64_vvqmovn_u16(x: u16x8) -> u8x8;
fn aarch64_vvqmovn_s32(x: i32x4) -> i16x4;
fn aarch64_vvqmovn_u32(x: u32x4) -> u16x4;
fn aarch64_vvqmovn_s64(x: i64x2) -> i32x2;
fn aarch64_vvqmovn_u64(x: u64x2) -> u32x2;
fn aarch64_vabs_s8(x: i8x8) -> i8x8;
fn aarch64_vabs_s16(x: i16x4) -> i16x4;
fn aarch64_vabs_s32(x: i32x2) -> i32x2;
fn aarch64_vabs_s64(x: i64x1) -> i64x1;
fn aarch64_vabsq_s8(x: i8x16) -> i8x16;
fn aarch64_vabsq_s16(x: i16x8) -> i16x8;
fn aarch64_vabsq_s32(x: i32x4) -> i32x4;
fn aarch64_vabsq_s64(x: i64x2) -> i64x2;
fn aarch64_vabs_f32(x: f32x2) -> f32x2;
fn aarch64_vabs_f64(x: f64x1) -> f64x1;
fn aarch64_vabsq_f32(x: f32x4) -> f32x4;
fn aarch64_vabsq_f64(x: f64x2) -> f64x2;
fn aarch64_vqabs_s8(x: i8x8) -> i8x8;
fn aarch64_vqabs_s16(x: i16x4) -> i16x4;
fn aarch64_vqabs_s32(x: i32x2) -> i32x2;
fn aarch64_vqabs_s64(x: i64x1) -> i64x1;
fn aarch64_vqabsq_s8(x: i8x16) -> i8x16;
fn aarch64_vqabsq_s16(x: i16x8) -> i16x8;
fn aarch64_vqabsq_s32(x: i32x4) -> i32x4;
fn aarch64_vqabsq_s64(x: i64x2) -> i64x2;
fn aarch64_vqneg_s8(x: i8x8) -> i8x8;
fn aarch64_vqneg_s16(x: i16x4) -> i16x4;
fn aarch64_vqneg_s32(x: i32x2) -> i32x2;
fn aarch64_vqneg_s64(x: i64x1) -> i64x1;
fn aarch64_vqnegq_s8(x: i8x16) -> i8x16;
fn aarch64_vqnegq_s16(x: i16x8) -> i16x8;
fn aarch64_vqnegq_s32(x: i32x4) -> i32x4;
fn aarch64_vqnegq_s64(x: i64x2) -> i64x2;
fn aarch64_vclz_s8(x: i8x8) -> i8x8;
fn aarch64_vclz_u8(x: u8x8) -> u8x8;
fn aarch64_vclz_s16(x: i16x4) -> i16x4;
fn aarch64_vclz_u16(x: u16x4) -> u16x4;
fn aarch64_vclz_s32(x: i32x2) -> i32x2;
fn aarch64_vclz_u32(x: u32x2) -> u32x2;
fn aarch64_vclzq_s8(x: i8x16) -> i8x16;
fn aarch64_vclzq_u8(x: u8x16) -> u8x16;
fn aarch64_vclzq_s16(x: i16x8) -> i16x8;
fn aarch64_vclzq_u16(x: u16x8) -> u16x8;
fn aarch64_vclzq_s32(x: i32x4) -> i32x4;
fn aarch64_vclzq_u32(x: u32x4) -> u32x4;
fn aarch64_vcls_s8(x: i8x8) -> i8x8;
fn aarch64_vcls_u8(x: u8x8) -> u8x8;
fn aarch64_vcls_s16(x: i16x4) -> i16x4;
fn aarch64_vcls_u16(x: u16x4) -> u16x4;
fn aarch64_vcls_s32(x: i32x2) -> i32x2;
fn aarch64_vcls_u32(x: u32x2) -> u32x2;
fn aarch64_vclsq_s8(x: i8x16) -> i8x16;
fn aarch64_vclsq_u8(x: u8x16) -> u8x16;
fn aarch64_vclsq_s16(x: i16x8) -> i16x8;
fn aarch64_vclsq_u16(x: u16x8) -> u16x8;
fn aarch64_vclsq_s32(x: i32x4) -> i32x4;
fn aarch64_vclsq_u32(x: u32x4) -> u32x4;
fn aarch64_vcnt_s8(x: i8x8) -> i8x8;
fn aarch64_vcnt_u8(x: u8x8) -> u8x8;
fn aarch64_vcntq_s8(x: i8x16) -> i8x16;
fn aarch64_vcntq_u8(x: u8x16) -> u8x16;
fn aarch64_vrecpe_u32(x: u32x2) -> u32x2;
fn aarch64_vrecpe_f32(x: f32x2) -> f32x2;
fn aarch64_vrecpe_f64(x: f64x1) -> f64x1;
fn aarch64_vrecpeq_u32(x: u32x4) -> u32x4;
fn aarch64_vrecpeq_f32(x: f32x4) -> f32x4;
fn aarch64_vrecpeq_f64(x: f64x2) -> f64x2;
fn aarch64_vrecps_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vrecps_f64(x: f64x1, y: f64x1) -> f64x1;
fn aarch64_vrecpsq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vrecpsq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vsqrt_f32(x: f32x2) -> f32x2;
fn aarch64_vsqrt_f64(x: f64x1) -> f64x1;
fn aarch64_vsqrtq_f32(x: f32x4) -> f32x4;
fn aarch64_vsqrtq_f64(x: f64x2) -> f64x2;
fn aarch64_vrsqrte_u32(x: u32x2) -> u32x2;
fn aarch64_vrsqrte_f32(x: f32x2) -> f32x2;
fn aarch64_vrsqrte_f64(x: f64x1) -> f64x1;
fn aarch64_vrsqrteq_u32(x: u32x4) -> u32x4;
fn aarch64_vrsqrteq_f32(x: f32x4) -> f32x4;
fn aarch64_vrsqrteq_f64(x: f64x2) -> f64x2;
fn aarch64_vrsqrts_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vrsqrts_f64(x: f64x1, y: f64x1) -> f64x1;
fn aarch64_vrsqrtsq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vrsqrtsq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vrbit_s8(x: i8x8) -> i8x8;
fn aarch64_vrbit_u8(x: u8x8) -> u8x8;
fn aarch64_vrbitq_s8(x: i8x16) -> i8x16;
fn aarch64_vrbitq_u8(x: u8x16) -> u8x16;
fn aarch64_vpadd_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vpadd_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vpadd_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vpadd_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vpadd_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vpadd_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vpadd_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vpaddq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vpaddq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vpaddq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vpaddq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vpaddq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vpaddq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vpaddq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vpaddq_s64(x: i64x2, y: i64x2) -> i64x2;
fn aarch64_vpaddq_u64(x: u64x2, y: u64x2) -> u64x2;
fn aarch64_vpaddq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vpaddl_s16(x: i8x8) -> i16x4;
fn aarch64_vpaddl_u16(x: u8x8) -> u16x4;
fn aarch64_vpaddl_s32(x: i16x4) -> i32x2;
fn aarch64_vpaddl_u32(x: u16x4) -> u32x2;
fn aarch64_vpaddl_s64(x: i32x2) -> i64x1;
fn aarch64_vpaddl_u64(x: u32x2) -> u64x1;
fn aarch64_vpaddlq_s16(x: i8x16) -> i16x8;
fn aarch64_vpaddlq_u16(x: u8x16) -> u16x8;
fn aarch64_vpaddlq_s32(x: i16x8) -> i32x4;
fn aarch64_vpaddlq_u32(x: u16x8) -> u32x4;
fn aarch64_vpaddlq_s64(x: i32x4) -> i64x2;
fn aarch64_vpaddlq_u64(x: u32x4) -> u64x2;
fn aarch64_vpmax_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vpmax_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vpmax_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vpmax_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vpmax_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vpmax_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vpmax_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vpmaxq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vpmaxq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vpmaxq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vpmaxq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vpmaxq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vpmaxq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vpmaxq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vpmaxq_s64(x: i64x2, y: i64x2) -> i64x2;
fn aarch64_vpmaxq_u64(x: u64x2, y: u64x2) -> u64x2;
fn aarch64_vpmaxq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vpmin_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vpmin_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vpmin_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vpmin_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vpmin_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vpmin_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vpmin_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vpminq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vpminq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vpminq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vpminq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vpminq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vpminq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vpminq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vpminq_s64(x: i64x2, y: i64x2) -> i64x2;
fn aarch64_vpminq_u64(x: u64x2, y: u64x2) -> u64x2;
fn aarch64_vpminq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vpmaxnm_s8(x: i8x8, y: i8x8) -> i8x8;
fn aarch64_vpmaxnm_u8(x: u8x8, y: u8x8) -> u8x8;
fn aarch64_vpmaxnm_s16(x: i16x4, y: i16x4) -> i16x4;
fn aarch64_vpmaxnm_u16(x: u16x4, y: u16x4) -> u16x4;
fn aarch64_vpmaxnm_s32(x: i32x2, y: i32x2) -> i32x2;
fn aarch64_vpmaxnm_u32(x: u32x2, y: u32x2) -> u32x2;
fn aarch64_vpmaxnm_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vpmaxnmq_s8(x: i8x16, y: i8x16) -> i8x16;
fn aarch64_vpmaxnmq_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vpmaxnmq_s16(x: i16x8, y: i16x8) -> i16x8;
fn aarch64_vpmaxnmq_u16(x: u16x8, y: u16x8) -> u16x8;
fn aarch64_vpmaxnmq_s32(x: i32x4, y: i32x4) -> i32x4;
fn aarch64_vpmaxnmq_u32(x: u32x4, y: u32x4) -> u32x4;
fn aarch64_vpmaxnmq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vpmaxnmq_s64(x: i64x2, y: i64x2) -> i64x2;
fn aarch64_vpmaxnmq_u64(x: u64x2, y: u64x2) -> u64x2;
fn aarch64_vpmaxnmq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vpminnm_f32(x: f32x2, y: f32x2) -> f32x2;
fn aarch64_vpminnmq_f32(x: f32x4, y: f32x4) -> f32x4;
fn aarch64_vpminnmq_f64(x: f64x2, y: f64x2) -> f64x2;
fn aarch64_vaddv_s8(x: i8x8) -> i8;
fn aarch64_vaddv_u8(x: u8x8) -> u8;
fn aarch64_vaddv_s16(x: i16x4) -> i16;
fn aarch64_vaddv_u16(x: u16x4) -> u16;
fn aarch64_vaddv_s32(x: i32x2) -> i32;
fn aarch64_vaddv_u32(x: u32x2) -> u32;
fn aarch64_vaddv_f32(x: f32x2) -> f32;
fn aarch64_vaddvq_s8(x: i8x16) -> i8;
fn aarch64_vaddvq_u8(x: u8x16) -> u8;
fn aarch64_vaddvq_s16(x: i16x8) -> i16;
fn aarch64_vaddvq_u16(x: u16x8) -> u16;
fn aarch64_vaddvq_s32(x: i32x4) -> i32;
fn aarch64_vaddvq_u32(x: u32x4) -> u32;
fn aarch64_vaddvq_f32(x: f32x4) -> f32;
fn aarch64_vaddvq_s64(x: i64x2) -> i64;
fn aarch64_vaddvq_u64(x: u64x2) -> u64;
fn aarch64_vaddvq_f64(x: f64x2) -> f64;
fn aarch64_vaddlv_s8(x: i8x8) -> i16;
fn aarch64_vaddlv_u8(x: u8x8) -> u16;
fn aarch64_vaddlv_s16(x: i16x4) -> i32;
fn aarch64_vaddlv_u16(x: u16x4) -> u32;
fn aarch64_vaddlv_s32(x: i32x2) -> i64;
fn aarch64_vaddlv_u32(x: u32x2) -> u64;
fn aarch64_vaddlvq_s8(x: i8x16) -> i16;
fn aarch64_vaddlvq_u8(x: u8x16) -> u16;
fn aarch64_vaddlvq_s16(x: i16x8) -> i32;
fn aarch64_vaddlvq_u16(x: u16x8) -> u32;
fn aarch64_vaddlvq_s32(x: i32x4) -> i64;
fn aarch64_vaddlvq_u32(x: u32x4) -> u64;
fn aarch64_vmaxv_s8(x: i8x8) -> i8;
fn aarch64_vmaxv_u8(x: u8x8) -> u8;
fn aarch64_vmaxv_s16(x: i16x4) -> i16;
fn aarch64_vmaxv_u16(x: u16x4) -> u16;
fn aarch64_vmaxv_s32(x: i32x2) -> i32;
fn aarch64_vmaxv_u32(x: u32x2) -> u32;
fn aarch64_vmaxv_f32(x: f32x2) -> f32;
fn aarch64_vmaxvq_s8(x: i8x16) -> i8;
fn aarch64_vmaxvq_u8(x: u8x16) -> u8;
fn aarch64_vmaxvq_s16(x: i16x8) -> i16;
fn aarch64_vmaxvq_u16(x: u16x8) -> u16;
fn aarch64_vmaxvq_s32(x: i32x4) -> i32;
fn aarch64_vmaxvq_u32(x: u32x4) -> u32;
fn aarch64_vmaxvq_f32(x: f32x4) -> f32;
fn aarch64_vmaxvq_f64(x: f64x2) -> f64;
fn aarch64_vminv_s8(x: i8x8) -> i8;
fn aarch64_vminv_u8(x: u8x8) -> u8;
fn aarch64_vminv_s16(x: i16x4) -> i16;
fn aarch64_vminv_u16(x: u16x4) -> u16;
fn aarch64_vminv_s32(x: i32x2) -> i32;
fn aarch64_vminv_u32(x: u32x2) -> u32;
fn aarch64_vminv_f32(x: f32x2) -> f32;
fn aarch64_vminvq_s8(x: i8x16) -> i8;
fn aarch64_vminvq_u8(x: u8x16) -> u8;
fn aarch64_vminvq_s16(x: i16x8) -> i16;
fn aarch64_vminvq_u16(x: u16x8) -> u16;
fn aarch64_vminvq_s32(x: i32x4) -> i32;
fn aarch64_vminvq_u32(x: u32x4) -> u32;
fn aarch64_vminvq_f32(x: f32x4) -> f32;
fn aarch64_vminvq_f64(x: f64x2) -> f64;
fn aarch64_vmaxnmv_f32(x: f32x2) -> f32;
fn aarch64_vmaxnmvq_f32(x: f32x4) -> f32;
fn aarch64_vmaxnmvq_f64(x: f64x2) -> f64;
fn aarch64_vminnmv_f32(x: f32x2) -> f32;
fn aarch64_vminnmvq_f32(x: f32x4) -> f32;
fn aarch64_vminnmvq_f64(x: f64x2) -> f64;
fn aarch64_vqtbl1_s8(x: i8x16, y: u8x8) -> i8x8;
fn aarch64_vqtbl1_u8(x: u8x16, y: u8x8) -> u8x8;
fn aarch64_vqtbl1q_s8(x: i8x16, y: u8x16) -> i8x16;
fn aarch64_vqtbl1q_u8(x: u8x16, y: u8x16) -> u8x16;
fn aarch64_vqtbx1_s8(x: i8x8, y: i8x16, z: u8x8) -> i8x8;
fn aarch64_vqtbx1_u8(x: u8x8, y: u8x16, z: u8x8) -> u8x8;
fn aarch64_vqtbx1q_s8(x: i8x16, y: i8x16, z: u8x16) -> i8x16;
fn aarch64_vqtbx1q_u8(x: u8x16, y: u8x16, z: u8x16) -> u8x16;
fn aarch64_vqtbl2_s8(x: (i8x16, i8x16), y: u8x8) -> i8x8;
fn aarch64_vqtbl2_u8(x: (u8x16, u8x16), y: u8x8) -> u8x8;
fn aarch64_vqtbl2q_s8(x: (i8x16, i8x16), y: u8x16) -> i8x16;
fn aarch64_vqtbl2q_u8(x: (u8x16, u8x16), y: u8x16) -> u8x16;
fn aarch64_vqtbx2_s8(x: (i8x16, i8x16), y: u8x8) -> i8x8;
fn aarch64_vqtbx2_u8(x: (u8x16, u8x16), y: u8x8) -> u8x8;
fn aarch64_vqtbx2q_s8(x: (i8x16, i8x16), y: u8x16) -> i8x16;
fn aarch64_vqtbx2q_u8(x: (u8x16, u8x16), y: u8x16) -> u8x16;
fn aarch64_vqtbl3_s8(x: (i8x16, i8x16, i8x16), y: u8x8) -> i8x8;
fn aarch64_vqtbl3_u8(x: (u8x16, u8x16, u8x16), y: u8x8) -> u8x8;
fn aarch64_vqtbl3q_s8(x: (i8x16, i8x16, i8x16), y: u8x16) -> i8x16;
fn aarch64_vqtbl3q_u8(x: (u8x16, u8x16, u8x16), y: u8x16) -> u8x16;
fn aarch64_vqtbx3_s8(x: i8x8, y: (i8x16, i8x16, i8x16), z: u8x8) -> i8x8;
fn aarch64_vqtbx3_u8(x: u8x8, y: (u8x16, u8x16, u8x16), z: u8x8) -> u8x8;
fn aarch64_vqtbx3q_s8(x: i8x16, y: (i8x16, i8x16, i8x16), z: u8x16) -> i8x16;
fn aarch64_vqtbx3q_u8(x: u8x16, y: (u8x16, u8x16, u8x16), z: u8x16) -> u8x16;
fn aarch64_vqtbl4_s8(x: (i8x16, i8x16, i8x16, i8x16), y: u8x8) -> i8x8;
fn aarch64_vqtbl4_u8(x: (u8x16, u8x16, u8x16, u8x16), y: u8x8) -> u8x8;
fn aarch64_vqtbl4q_s8(x: (i8x16, i8x16, i8x16, i8x16), y: u8x16) -> i8x16;
fn aarch64_vqtbl4q_u8(x: (u8x16, u8x16, u8x16, u8x16), y: u8x16) -> u8x16;
fn aarch64_vqtbx4_s8(x: i8x8, y: (i8x16, i8x16, i8x16, i8x16), z: u8x8) -> i8x8;
fn aarch64_vqtbx4_u8(x: u8x8, y: (u8x16, u8x16, u8x16, u8x16), z: u8x8) -> u8x8;
fn aarch64_vqtbx4q_s8(x: i8x16, y: (i8x16, i8x16, i8x16, i8x16), z: u8x16) -> i8x16;
fn aarch64_vqtbx4q_u8(x: u8x16, y: (u8x16, u8x16, u8x16, u8x16), z: u8x16) -> u8x16;
}
pub trait Aarch64F32x4 {
fn to_f64(self) -> f64x2;
}
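// Widens the two low f32 lanes to f64 (typically a single FCVTL on AArch64).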
impl Aarch64F32x4 for f32x4 {
#[inline]
fn to_f64(self) -> f64x2 {
unsafe {
simd_cast(f32x2(self.0, self.1))
}
}
}
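// table_lookup_1 is the NEON TBL pattern: each byte lane of `self` selects a
// byte of the table `t0`, and out-of-range indices yield zero.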
pub trait Aarch64U8x16 {
fn table_lookup_1(self, t0: u8x16) -> u8x16;
}
impl Aarch64U8x16 for u8x16 {
#[inline]
fn table_lookup_1(self, t0: u8x16) -> u8x16 {
unsafe {aarch64_vqtbl1q_u8(t0, self)}
}
}
pub trait Aarch64I8x16 {
fn table_lookup_1(self, t0: i8x16) -> i8x16;
}
impl Aarch64I8x16 for i8x16 {
#[inline]
fn table_lookup_1(self, t0: i8x16) -> i8x16 {
unsafe {aarch64_vqtbl2q_s8((t0, t0), ::bitcast(self))}
}
}
#[doc(hidden)]
pub mod common {
use super::super::super::*;
use std::mem;
#[inline]
pub fn f32x4_sqrt(x: f32x4) -> f32x4 {
unsafe {super::aarch64_vsqrtq_f32(x)}
}
#[inline]
pub fn f32x4_approx_rsqrt(x: f32x4) -> f32x4 {
unsafe {super::aarch64_vrsqrteq_f32(x)}
}
#[inline]
pub fn f32x4_approx_reciprocal(x: f32x4) -> f32x4 {
unsafe {super::aarch64_vrecpeq_f32(x)}
}
#[inline]
pub fn f32x4_max(x: f32x4, y: f32x4) -> f32x4 {
unsafe {super::aarch64_vmaxq_f32(x, y)}
}
#[inline]
pub fn f32x4_min(x: f32x4, y: f32x4) -> f32x4 {
unsafe {super::aarch64_vminq_f32(x, y)}
}
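// Boolean vectors are all-ones / all-zero lane masks, so `all` reduces with an
// across-lanes unsigned minimum (zero iff any lane is false) and `any` with a
// maximum (non-zero iff any lane is true).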
macro_rules! bools {
($($ty: ty, $all: ident ($min: ident), $any: ident ($max: ident);)*) => {
$(
#[inline]
pub fn $all(x: $ty) -> bool {
unsafe {
super::$min(mem::transmute(x)) != 0
}
}
#[inline]
pub fn $any(x: $ty) -> bool {
unsafe {
super::$max(mem::transmute(x)) != 0
}
}
)*
}
}
bools! {
bool32fx4, bool32fx4_all(aarch64_vminvq_u32), bool32fx4_any(aarch64_vmaxvq_u32);
bool8ix16, bool8ix16_all(aarch64_vminvq_u8), bool8ix16_any(aarch64_vmaxvq_u8);
bool16ix8, bool16ix8_all(aarch64_vminvq_u16), bool16ix8_any(aarch64_vmaxvq_u16);
bool32ix4, bool32ix4_all(aarch64_vminvq_u32), bool32ix4_any(aarch64_vmaxvq_u32);
}
}

4
third_party/rust/simd/src/arm/mod.rs vendored Normal file
@@ -0,0 +1,4 @@
//! Features specific to ARM CPUs.
#[cfg(any(feature = "doc", target_feature = "neon"))]
pub mod neon;

530
third_party/rust/simd/src/arm/neon.rs vendored Normal file
@@ -0,0 +1,530 @@
use super::super::*;
use sixty_four::{i64x2, u64x2};
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u32x2(u32, u32);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i32x2(i32, i32);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct f32x2(f32, f32);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u16x4(u16, u16, u16, u16);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i16x4(i16, i16, i16, i16);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u8x8(u8, u8, u8, u8,
u8, u8, u8, u8);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i8x8(i8, i8, i8, i8,
i8, i8, i8, i8);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i64x1(i64);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u64x1(u64);
#[allow(dead_code)]
extern "platform-intrinsic" {
fn arm_vhadd_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vhadd_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vhadd_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vhadd_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vhadd_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vhadd_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vhaddq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vhaddq_u8(x: u8x16, y: u8x16) -> u8x16;
fn arm_vhaddq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vhaddq_u16(x: u16x8, y: u16x8) -> u16x8;
fn arm_vhaddq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vhaddq_u32(x: u32x4, y: u32x4) -> u32x4;
fn arm_vrhadd_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vrhadd_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vrhadd_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vrhadd_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vrhadd_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vrhadd_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vrhaddq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vrhaddq_u8(x: u8x16, y: u8x16) -> u8x16;
fn arm_vrhaddq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vrhaddq_u16(x: u16x8, y: u16x8) -> u16x8;
fn arm_vrhaddq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vrhaddq_u32(x: u32x4, y: u32x4) -> u32x4;
fn arm_vqadd_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vqadd_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vqadd_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vqadd_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vqadd_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vqadd_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vqadd_s64(x: i64x1, y: i64x1) -> i64x1;
fn arm_vqadd_u64(x: u64x1, y: u64x1) -> u64x1;
fn arm_vqaddq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vqaddq_u8(x: u8x16, y: u8x16) -> u8x16;
fn arm_vqaddq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vqaddq_u16(x: u16x8, y: u16x8) -> u16x8;
fn arm_vqaddq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vqaddq_u32(x: u32x4, y: u32x4) -> u32x4;
fn arm_vqaddq_s64(x: i64x2, y: i64x2) -> i64x2;
fn arm_vqaddq_u64(x: u64x2, y: u64x2) -> u64x2;
fn arm_vraddhn_s16(x: i16x8, y: i16x8) -> i8x8;
fn arm_vraddhn_u16(x: u16x8, y: u16x8) -> u8x8;
fn arm_vraddhn_s32(x: i32x4, y: i32x4) -> i16x4;
fn arm_vraddhn_u32(x: u32x4, y: u32x4) -> u16x4;
fn arm_vraddhn_s64(x: i64x2, y: i64x2) -> i32x2;
fn arm_vraddhn_u64(x: u64x2, y: u64x2) -> u32x2;
fn arm_vfma_f32(x: f32x2, y: f32x2) -> f32x2;
fn arm_vfmaq_f32(x: f32x4, y: f32x4) -> f32x4;
fn arm_vqdmulh_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vqdmulh_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vqdmulhq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vqdmulhq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vqrdmulh_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vqrdmulh_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vqrdmulhq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vqrdmulhq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vmull_s8(x: i8x8, y: i8x8) -> i16x8;
fn arm_vmull_u8(x: u8x8, y: u8x8) -> u16x8;
fn arm_vmull_s16(x: i16x4, y: i16x4) -> i32x4;
fn arm_vmull_u16(x: u16x4, y: u16x4) -> u32x4;
fn arm_vmull_s32(x: i32x2, y: i32x2) -> i64x2;
fn arm_vmull_u32(x: u32x2, y: u32x2) -> u64x2;
fn arm_vqdmullq_s8(x: i8x8, y: i8x8) -> i16x8;
fn arm_vqdmullq_s16(x: i16x4, y: i16x4) -> i32x4;
fn arm_vhsub_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vhsub_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vhsub_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vhsub_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vhsub_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vhsub_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vhsubq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vhsubq_u8(x: u8x16, y: u8x16) -> u8x16;
fn arm_vhsubq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vhsubq_u16(x: u16x8, y: u16x8) -> u16x8;
fn arm_vhsubq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vhsubq_u32(x: u32x4, y: u32x4) -> u32x4;
fn arm_vqsub_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vqsub_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vqsub_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vqsub_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vqsub_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vqsub_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vqsub_s64(x: i64x1, y: i64x1) -> i64x1;
fn arm_vqsub_u64(x: u64x1, y: u64x1) -> u64x1;
fn arm_vqsubq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vqsubq_u8(x: u8x16, y: u8x16) -> u8x16;
fn arm_vqsubq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vqsubq_u16(x: u16x8, y: u16x8) -> u16x8;
fn arm_vqsubq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vqsubq_u32(x: u32x4, y: u32x4) -> u32x4;
fn arm_vqsubq_s64(x: i64x2, y: i64x2) -> i64x2;
fn arm_vqsubq_u64(x: u64x2, y: u64x2) -> u64x2;
fn arm_vrsubhn_s16(x: i16x8, y: i16x8) -> i8x8;
fn arm_vrsubhn_u16(x: u16x8, y: u16x8) -> u8x8;
fn arm_vrsubhn_s32(x: i32x4, y: i32x4) -> i16x4;
fn arm_vrsubhn_u32(x: u32x4, y: u32x4) -> u16x4;
fn arm_vrsubhn_s64(x: i64x2, y: i64x2) -> i32x2;
fn arm_vrsubhn_u64(x: u64x2, y: u64x2) -> u32x2;
fn arm_vabd_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vabd_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vabd_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vabd_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vabd_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vabd_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vabd_f32(x: f32x2, y: f32x2) -> f32x2;
fn arm_vabdq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vabdq_u8(x: u8x16, y: u8x16) -> u8x16;
fn arm_vabdq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vabdq_u16(x: u16x8, y: u16x8) -> u16x8;
fn arm_vabdq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vabdq_u32(x: u32x4, y: u32x4) -> u32x4;
fn arm_vabdq_f32(x: f32x4, y: f32x4) -> f32x4;
fn arm_vmax_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vmax_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vmax_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vmax_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vmax_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vmax_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vmax_f32(x: f32x2, y: f32x2) -> f32x2;
fn arm_vmaxq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vmaxq_u8(x: u8x16, y: u8x16) -> u8x16;
fn arm_vmaxq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vmaxq_u16(x: u16x8, y: u16x8) -> u16x8;
fn arm_vmaxq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vmaxq_u32(x: u32x4, y: u32x4) -> u32x4;
fn arm_vmaxq_f32(x: f32x4, y: f32x4) -> f32x4;
fn arm_vmin_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vmin_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vmin_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vmin_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vmin_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vmin_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vmin_f32(x: f32x2, y: f32x2) -> f32x2;
fn arm_vminq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vminq_u8(x: u8x16, y: u8x16) -> u8x16;
fn arm_vminq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vminq_u16(x: u16x8, y: u16x8) -> u16x8;
fn arm_vminq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vminq_u32(x: u32x4, y: u32x4) -> u32x4;
fn arm_vminq_f32(x: f32x4, y: f32x4) -> f32x4;
fn arm_vshl_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vshl_u8(x: u8x8, y: i8x8) -> u8x8;
fn arm_vshl_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vshl_u16(x: u16x4, y: i16x4) -> u16x4;
fn arm_vshl_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vshl_u32(x: u32x2, y: i32x2) -> u32x2;
fn arm_vshl_s64(x: i64x1, y: i64x1) -> i64x1;
fn arm_vshl_u64(x: u64x1, y: i64x1) -> u64x1;
fn arm_vshlq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vshlq_u8(x: u8x16, y: i8x16) -> u8x16;
fn arm_vshlq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vshlq_u16(x: u16x8, y: i16x8) -> u16x8;
fn arm_vshlq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vshlq_u32(x: u32x4, y: i32x4) -> u32x4;
fn arm_vshlq_s64(x: i64x2, y: i64x2) -> i64x2;
fn arm_vshlq_u64(x: u64x2, y: i64x2) -> u64x2;
fn arm_vqshl_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vqshl_u8(x: u8x8, y: i8x8) -> u8x8;
fn arm_vqshl_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vqshl_u16(x: u16x4, y: i16x4) -> u16x4;
fn arm_vqshl_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vqshl_u32(x: u32x2, y: i32x2) -> u32x2;
fn arm_vqshl_s64(x: i64x1, y: i64x1) -> i64x1;
fn arm_vqshl_u64(x: u64x1, y: i64x1) -> u64x1;
fn arm_vqshlq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vqshlq_u8(x: u8x16, y: i8x16) -> u8x16;
fn arm_vqshlq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vqshlq_u16(x: u16x8, y: i16x8) -> u16x8;
fn arm_vqshlq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vqshlq_u32(x: u32x4, y: i32x4) -> u32x4;
fn arm_vqshlq_s64(x: i64x2, y: i64x2) -> i64x2;
fn arm_vqshlq_u64(x: u64x2, y: i64x2) -> u64x2;
fn arm_vrshl_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vrshl_u8(x: u8x8, y: i8x8) -> u8x8;
fn arm_vrshl_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vrshl_u16(x: u16x4, y: i16x4) -> u16x4;
fn arm_vrshl_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vrshl_u32(x: u32x2, y: i32x2) -> u32x2;
fn arm_vrshl_s64(x: i64x1, y: i64x1) -> i64x1;
fn arm_vrshl_u64(x: u64x1, y: i64x1) -> u64x1;
fn arm_vrshlq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vrshlq_u8(x: u8x16, y: i8x16) -> u8x16;
fn arm_vrshlq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vrshlq_u16(x: u16x8, y: i16x8) -> u16x8;
fn arm_vrshlq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vrshlq_u32(x: u32x4, y: i32x4) -> u32x4;
fn arm_vrshlq_s64(x: i64x2, y: i64x2) -> i64x2;
fn arm_vrshlq_u64(x: u64x2, y: i64x2) -> u64x2;
fn arm_vqrshl_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vqrshl_u8(x: u8x8, y: i8x8) -> u8x8;
fn arm_vqrshl_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vqrshl_u16(x: u16x4, y: i16x4) -> u16x4;
fn arm_vqrshl_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vqrshl_u32(x: u32x2, y: i32x2) -> u32x2;
fn arm_vqrshl_s64(x: i64x1, y: i64x1) -> i64x1;
fn arm_vqrshl_u64(x: u64x1, y: i64x1) -> u64x1;
fn arm_vqrshlq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vqrshlq_u8(x: u8x16, y: i8x16) -> u8x16;
fn arm_vqrshlq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vqrshlq_u16(x: u16x8, y: i16x8) -> u16x8;
fn arm_vqrshlq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vqrshlq_u32(x: u32x4, y: i32x4) -> u32x4;
fn arm_vqrshlq_s64(x: i64x2, y: i64x2) -> i64x2;
fn arm_vqrshlq_u64(x: u64x2, y: i64x2) -> u64x2;
fn arm_vqshrun_n_s16(x: i16x8, y: u32) -> i8x8;
fn arm_vqshrun_n_s32(x: i32x4, y: u32) -> i16x4;
fn arm_vqshrun_n_s64(x: i64x2, y: u32) -> i32x2;
fn arm_vqrshrun_n_s16(x: i16x8, y: u32) -> i8x8;
fn arm_vqrshrun_n_s32(x: i32x4, y: u32) -> i16x4;
fn arm_vqrshrun_n_s64(x: i64x2, y: u32) -> i32x2;
fn arm_vqshrn_n_s16(x: i16x8, y: u32) -> i8x8;
fn arm_vqshrn_n_u16(x: u16x8, y: u32) -> u8x8;
fn arm_vqshrn_n_s32(x: i32x4, y: u32) -> i16x4;
fn arm_vqshrn_n_u32(x: u32x4, y: u32) -> u16x4;
fn arm_vqshrn_n_s64(x: i64x2, y: u32) -> i32x2;
fn arm_vqshrn_n_u64(x: u64x2, y: u32) -> u32x2;
fn arm_vrshrn_n_s16(x: i16x8, y: u32) -> i8x8;
fn arm_vrshrn_n_u16(x: u16x8, y: u32) -> u8x8;
fn arm_vrshrn_n_s32(x: i32x4, y: u32) -> i16x4;
fn arm_vrshrn_n_u32(x: u32x4, y: u32) -> u16x4;
fn arm_vrshrn_n_s64(x: i64x2, y: u32) -> i32x2;
fn arm_vrshrn_n_u64(x: u64x2, y: u32) -> u32x2;
fn arm_vqrshrn_n_s16(x: i16x8, y: u32) -> i8x8;
fn arm_vqrshrn_n_u16(x: u16x8, y: u32) -> u8x8;
fn arm_vqrshrn_n_s32(x: i32x4, y: u32) -> i16x4;
fn arm_vqrshrn_n_u32(x: u32x4, y: u32) -> u16x4;
fn arm_vqrshrn_n_s64(x: i64x2, y: u32) -> i32x2;
fn arm_vqrshrn_n_u64(x: u64x2, y: u32) -> u32x2;
fn arm_vsri_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vsri_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vsri_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vsri_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vsri_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vsri_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vsri_s64(x: i64x1, y: i64x1) -> i64x1;
fn arm_vsri_u64(x: u64x1, y: u64x1) -> u64x1;
fn arm_vsriq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vsriq_u8(x: u8x16, y: u8x16) -> u8x16;
fn arm_vsriq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vsriq_u16(x: u16x8, y: u16x8) -> u16x8;
fn arm_vsriq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vsriq_u32(x: u32x4, y: u32x4) -> u32x4;
fn arm_vsriq_s64(x: i64x2, y: i64x2) -> i64x2;
fn arm_vsriq_u64(x: u64x2, y: u64x2) -> u64x2;
fn arm_vsli_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vsli_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vsli_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vsli_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vsli_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vsli_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vsli_s64(x: i64x1, y: i64x1) -> i64x1;
fn arm_vsli_u64(x: u64x1, y: u64x1) -> u64x1;
fn arm_vsliq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vsliq_u8(x: u8x16, y: u8x16) -> u8x16;
fn arm_vsliq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vsliq_u16(x: u16x8, y: u16x8) -> u16x8;
fn arm_vsliq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vsliq_u32(x: u32x4, y: u32x4) -> u32x4;
fn arm_vsliq_s64(x: i64x2, y: i64x2) -> i64x2;
fn arm_vsliq_u64(x: u64x2, y: u64x2) -> u64x2;
fn arm_vvqmovn_s16(x: i16x8) -> i8x8;
fn arm_vvqmovn_u16(x: u16x8) -> u8x8;
fn arm_vvqmovn_s32(x: i32x4) -> i16x4;
fn arm_vvqmovn_u32(x: u32x4) -> u16x4;
fn arm_vvqmovn_s64(x: i64x2) -> i32x2;
fn arm_vvqmovn_u64(x: u64x2) -> u32x2;
fn arm_vabs_s8(x: i8x8) -> i8x8;
fn arm_vabs_s16(x: i16x4) -> i16x4;
fn arm_vabs_s32(x: i32x2) -> i32x2;
fn arm_vabsq_s8(x: i8x16) -> i8x16;
fn arm_vabsq_s16(x: i16x8) -> i16x8;
fn arm_vabsq_s32(x: i32x4) -> i32x4;
fn arm_vabs_f32(x: f32x2) -> f32x2;
fn arm_vabsq_f32(x: f32x4) -> f32x4;
fn arm_vqabs_s8(x: i8x8) -> i8x8;
fn arm_vqabs_s16(x: i16x4) -> i16x4;
fn arm_vqabs_s32(x: i32x2) -> i32x2;
fn arm_vqabsq_s8(x: i8x16) -> i8x16;
fn arm_vqabsq_s16(x: i16x8) -> i16x8;
fn arm_vqabsq_s32(x: i32x4) -> i32x4;
fn arm_vqneg_s8(x: i8x8) -> i8x8;
fn arm_vqneg_s16(x: i16x4) -> i16x4;
fn arm_vqneg_s32(x: i32x2) -> i32x2;
fn arm_vqnegq_s8(x: i8x16) -> i8x16;
fn arm_vqnegq_s16(x: i16x8) -> i16x8;
fn arm_vqnegq_s32(x: i32x4) -> i32x4;
fn arm_vclz_s8(x: i8x8) -> i8x8;
fn arm_vclz_u8(x: u8x8) -> u8x8;
fn arm_vclz_s16(x: i16x4) -> i16x4;
fn arm_vclz_u16(x: u16x4) -> u16x4;
fn arm_vclz_s32(x: i32x2) -> i32x2;
fn arm_vclz_u32(x: u32x2) -> u32x2;
fn arm_vclzq_s8(x: i8x16) -> i8x16;
fn arm_vclzq_u8(x: u8x16) -> u8x16;
fn arm_vclzq_s16(x: i16x8) -> i16x8;
fn arm_vclzq_u16(x: u16x8) -> u16x8;
fn arm_vclzq_s32(x: i32x4) -> i32x4;
fn arm_vclzq_u32(x: u32x4) -> u32x4;
fn arm_vcls_s8(x: i8x8) -> i8x8;
fn arm_vcls_u8(x: u8x8) -> u8x8;
fn arm_vcls_s16(x: i16x4) -> i16x4;
fn arm_vcls_u16(x: u16x4) -> u16x4;
fn arm_vcls_s32(x: i32x2) -> i32x2;
fn arm_vcls_u32(x: u32x2) -> u32x2;
fn arm_vclsq_s8(x: i8x16) -> i8x16;
fn arm_vclsq_u8(x: u8x16) -> u8x16;
fn arm_vclsq_s16(x: i16x8) -> i16x8;
fn arm_vclsq_u16(x: u16x8) -> u16x8;
fn arm_vclsq_s32(x: i32x4) -> i32x4;
fn arm_vclsq_u32(x: u32x4) -> u32x4;
fn arm_vcnt_s8(x: i8x8) -> i8x8;
fn arm_vcnt_u8(x: u8x8) -> u8x8;
fn arm_vcntq_s8(x: i8x16) -> i8x16;
fn arm_vcntq_u8(x: u8x16) -> u8x16;
fn arm_vrecpe_u32(x: u32x2) -> u32x2;
fn arm_vrecpe_f32(x: f32x2) -> f32x2;
fn arm_vrecpeq_u32(x: u32x4) -> u32x4;
fn arm_vrecpeq_f32(x: f32x4) -> f32x4;
fn arm_vrecps_f32(x: f32x2, y: f32x2) -> f32x2;
fn arm_vrecpsq_f32(x: f32x4, y: f32x4) -> f32x4;
fn arm_vsqrt_f32(x: f32x2) -> f32x2;
fn arm_vsqrtq_f32(x: f32x4) -> f32x4;
fn arm_vrsqrte_u32(x: u32x2) -> u32x2;
fn arm_vrsqrte_f32(x: f32x2) -> f32x2;
fn arm_vrsqrteq_u32(x: u32x4) -> u32x4;
fn arm_vrsqrteq_f32(x: f32x4) -> f32x4;
fn arm_vrsqrts_f32(x: f32x2, y: f32x2) -> f32x2;
fn arm_vrsqrtsq_f32(x: f32x4, y: f32x4) -> f32x4;
fn arm_vbsl_s8(x: u8x8, y: i8x8) -> i8x8;
fn arm_vbsl_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vbsl_s16(x: u16x4, y: i16x4) -> i16x4;
fn arm_vbsl_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vbsl_s32(x: u32x2, y: i32x2) -> i32x2;
fn arm_vbsl_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vbsl_s64(x: u64x1, y: i64x1) -> i64x1;
fn arm_vbsl_u64(x: u64x1, y: u64x1) -> u64x1;
fn arm_vbslq_s8(x: u8x16, y: i8x16) -> i8x16;
fn arm_vbslq_u8(x: u8x16, y: u8x16) -> u8x16;
fn arm_vbslq_s16(x: u16x8, y: i16x8) -> i16x8;
fn arm_vbslq_u16(x: u16x8, y: u16x8) -> u16x8;
fn arm_vbslq_s32(x: u32x4, y: i32x4) -> i32x4;
fn arm_vbslq_u32(x: u32x4, y: u32x4) -> u32x4;
fn arm_vbslq_s64(x: u64x2, y: i64x2) -> i64x2;
fn arm_vbslq_u64(x: u64x2, y: u64x2) -> u64x2;
fn arm_vpadd_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vpadd_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vpadd_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vpadd_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vpadd_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vpadd_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vpadd_f32(x: f32x2, y: f32x2) -> f32x2;
fn arm_vpaddl_s16(x: i8x8) -> i16x4;
fn arm_vpaddl_u16(x: u8x8) -> u16x4;
fn arm_vpaddl_s32(x: i16x4) -> i32x2;
fn arm_vpaddl_u32(x: u16x4) -> u32x2;
fn arm_vpaddl_s64(x: i32x2) -> i64x1;
fn arm_vpaddl_u64(x: u32x2) -> u64x1;
fn arm_vpaddlq_s16(x: i8x16) -> i16x8;
fn arm_vpaddlq_u16(x: u8x16) -> u16x8;
fn arm_vpaddlq_s32(x: i16x8) -> i32x4;
fn arm_vpaddlq_u32(x: u16x8) -> u32x4;
fn arm_vpaddlq_s64(x: i32x4) -> i64x2;
fn arm_vpaddlq_u64(x: u32x4) -> u64x2;
fn arm_vpadal_s16(x: i16x4, y: i8x8) -> i16x4;
fn arm_vpadal_u16(x: u16x4, y: u8x8) -> u16x4;
fn arm_vpadal_s32(x: i32x2, y: i16x4) -> i32x2;
fn arm_vpadal_u32(x: u32x2, y: u16x4) -> u32x2;
fn arm_vpadal_s64(x: i64x1, y: i32x2) -> i64x1;
fn arm_vpadal_u64(x: u64x1, y: u32x2) -> u64x1;
fn arm_vpadalq_s16(x: i16x8, y: i8x16) -> i16x8;
fn arm_vpadalq_u16(x: u16x8, y: u8x16) -> u16x8;
fn arm_vpadalq_s32(x: i32x4, y: i16x8) -> i32x4;
fn arm_vpadalq_u32(x: u32x4, y: u16x8) -> u32x4;
fn arm_vpadalq_s64(x: i64x2, y: i32x4) -> i64x2;
fn arm_vpadalq_u64(x: u64x2, y: u32x4) -> u64x2;
fn arm_vpmax_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vpmax_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vpmax_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vpmax_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vpmax_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vpmax_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vpmax_f32(x: f32x2, y: f32x2) -> f32x2;
fn arm_vpmin_s8(x: i8x8, y: i8x8) -> i8x8;
fn arm_vpmin_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vpmin_s16(x: i16x4, y: i16x4) -> i16x4;
fn arm_vpmin_u16(x: u16x4, y: u16x4) -> u16x4;
fn arm_vpmin_s32(x: i32x2, y: i32x2) -> i32x2;
fn arm_vpmin_u32(x: u32x2, y: u32x2) -> u32x2;
fn arm_vpmin_f32(x: f32x2, y: f32x2) -> f32x2;
fn arm_vpminq_s8(x: i8x16, y: i8x16) -> i8x16;
fn arm_vpminq_u8(x: u8x16, y: u8x16) -> u8x16;
fn arm_vpminq_s16(x: i16x8, y: i16x8) -> i16x8;
fn arm_vpminq_u16(x: u16x8, y: u16x8) -> u16x8;
fn arm_vpminq_s32(x: i32x4, y: i32x4) -> i32x4;
fn arm_vpminq_u32(x: u32x4, y: u32x4) -> u32x4;
fn arm_vpminq_f32(x: f32x4, y: f32x4) -> f32x4;
fn arm_vtbl1_s8(x: i8x8, y: u8x8) -> i8x8;
fn arm_vtbl1_u8(x: u8x8, y: u8x8) -> u8x8;
fn arm_vtbx1_s8(x: i8x8, y: i8x8, z: u8x8) -> i8x8;
fn arm_vtbx1_u8(x: u8x8, y: u8x8, z: u8x8) -> u8x8;
fn arm_vtbl2_s8(x: (i8x8, i8x8), y: u8x8) -> i8x8;
fn arm_vtbl2_u8(x: (u8x8, u8x8), y: u8x8) -> u8x8;
fn arm_vtbx2_s8(x: (i8x8, i8x8), y: u8x8) -> i8x8;
fn arm_vtbx2_u8(x: (u8x8, u8x8), y: u8x8) -> u8x8;
fn arm_vtbl3_s8(x: (i8x8, i8x8, i8x8), y: u8x8) -> i8x8;
fn arm_vtbl3_u8(x: (u8x8, u8x8, u8x8), y: u8x8) -> u8x8;
fn arm_vtbx3_s8(x: i8x8, y: (i8x8, i8x8, i8x8), z: u8x8) -> i8x8;
fn arm_vtbx3_u8(x: u8x8, y: (u8x8, u8x8, u8x8), z: u8x8) -> u8x8;
fn arm_vtbl4_s8(x: (i8x8, i8x8, i8x8, i8x8), y: u8x8) -> i8x8;
fn arm_vtbl4_u8(x: (u8x8, u8x8, u8x8, u8x8), y: u8x8) -> u8x8;
fn arm_vtbx4_s8(x: i8x8, y: (i8x8, i8x8, i8x8, i8x8), z: u8x8) -> i8x8;
fn arm_vtbx4_u8(x: u8x8, y: (u8x8, u8x8, u8x8, u8x8), z: u8x8) -> u8x8;
}
impl u8x8 {
#[inline]
pub fn table_lookup_1(self, t0: u8x8) -> u8x8 {
unsafe {arm_vtbl1_u8(t0, self)}
}
#[inline]
pub fn table_lookup_2(self, t0: u8x8, t1: u8x8) -> u8x8 {
unsafe {arm_vtbl2_u8((t0, t1), self)}
}
#[inline]
pub fn table_lookup_3(self, t0: u8x8, t1: u8x8, t2: u8x8) -> u8x8 {
unsafe {arm_vtbl3_u8((t0, t1, t2), self)}
}
#[inline]
pub fn table_lookup_4(self, t0: u8x8, t1: u8x8, t2: u8x8, t3: u8x8) -> u8x8 {
unsafe {arm_vtbl4_u8((t0, t1, t2, t3), self)}
}
}
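// Editor's sketch (illustrative, assuming the NEON `u8x8` type exposes the
// same `new`/`splat` constructors as the portable vectors): each lane of
// `self` indexes into the table, so `table_lookup_1` is a byte shuffle
// (VTBL1); on NEON, out-of-range indices yield 0 rather than faulting.
//
//     let table = u8x8::new(10, 11, 12, 13, 14, 15, 16, 17);
//     let rev = u8x8::new(7, 6, 5, 4, 3, 2, 1, 0).table_lookup_1(table);
//     // rev == (17, 16, 15, 14, 13, 12, 11, 10)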
#[doc(hidden)]
pub mod common {
use super::super::super::*;
use super::*;
use std::mem;
#[inline]
pub fn f32x4_sqrt(x: f32x4) -> f32x4 {
unsafe {super::arm_vsqrtq_f32(x)}
}
#[inline]
pub fn f32x4_approx_rsqrt(x: f32x4) -> f32x4 {
unsafe {super::arm_vrsqrteq_f32(x)}
}
#[inline]
pub fn f32x4_approx_reciprocal(x: f32x4) -> f32x4 {
unsafe {super::arm_vrecpeq_f32(x)}
}
#[inline]
pub fn f32x4_max(x: f32x4, y: f32x4) -> f32x4 {
unsafe {super::arm_vmaxq_f32(x, y)}
}
#[inline]
pub fn f32x4_min(x: f32x4, y: f32x4) -> f32x4 {
unsafe {super::arm_vminq_f32(x, y)}
}
macro_rules! bools {
    ($($ty: ty, $half: ty, $all: ident ($min: ident), $any: ident ($max: ident);)*) => {
        $(
            #[inline]
            pub fn $all(x: $ty) -> bool {
                unsafe {
                    let (lo, hi): ($half, $half) = mem::transmute(x);
                    // Fold with pairwise min until lane 0 holds the
                    // reduction over every lane: the extra rounds cover the
                    // widest (8-lane) halves and are harmless for narrower
                    // ones. This also avoids the previous
                    // `mem::uninitialized()` operand, which left the 8- and
                    // 16-bit vectors only partially reduced.
                    let mut y = super::$min(lo, hi);
                    y = super::$min(y, y);
                    y = super::$min(y, y);
                    y = super::$min(y, y);
                    y.0 != 0
                }
            }
            #[inline]
            pub fn $any(x: $ty) -> bool {
                unsafe {
                    let (lo, hi): ($half, $half) = mem::transmute(x);
                    let mut y = super::$max(lo, hi);
                    y = super::$max(y, y);
                    y = super::$max(y, y);
                    y = super::$max(y, y);
                    y.0 != 0
                }
            }
        )*
    }
}
bools! {
bool32fx4, arm::neon::u32x2, bool32fx4_all(arm_vpmin_u32), bool32fx4_any(arm_vpmax_u32);
bool8ix16, arm::neon::u8x8, bool8ix16_all(arm_vpmin_u8), bool8ix16_any(arm_vpmax_u8);
bool16ix8, arm::neon::u16x4, bool16ix8_all(arm_vpmin_u16), bool16ix8_any(arm_vpmax_u16);
bool32ix4, arm::neon::u32x2, bool32ix4_all(arm_vpmin_u32), bool32ix4_any(arm_vpmax_u32);
}
}

third_party/rust/simd/src/common.rs vendored Normal file
@@ -0,0 +1,521 @@
use super::*;
#[allow(unused_imports)]
use super::{
simd_eq, simd_ne, simd_lt, simd_le, simd_gt, simd_ge,
simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
simd_insert, simd_extract,
simd_cast,
simd_add, simd_sub, simd_mul, simd_div, simd_shl, simd_shr, simd_and, simd_or, simd_xor,
Unalign, bitcast,
};
use std::mem;
use std::ops;
#[cfg(any(target_arch = "x86",
target_arch = "x86_64"))]
use x86::sse2::common;
#[cfg(any(target_arch = "arm"))]
use arm::neon::common;
#[cfg(any(target_arch = "aarch64"))]
use aarch64::neon::common;
macro_rules! basic_impls {
($(
$name: ident:
$elem: ident, $bool: ident, $shuffle: ident, $length: expr, $($first: ident),* | $($last: ident),*;
)*) => {
$(impl $name {
/// Create a new instance.
#[inline]
pub const fn new($($first: $elem),*, $($last: $elem),*) -> $name {
$name($($first),*, $($last),*)
}
/// Create a new instance where every lane has value `x`.
#[inline]
pub const fn splat(x: $elem) -> $name {
$name($({ #[allow(dead_code)] struct $first; x }),*,
$({ #[allow(dead_code)] struct $last; x }),*)
}
/// Compare for equality.
#[inline]
pub fn eq(self, other: Self) -> $bool {
unsafe {simd_eq(self, other)}
}
/// Compare for inequality.
#[inline]
pub fn ne(self, other: Self) -> $bool {
unsafe {simd_ne(self, other)}
}
/// Compare for less-than.
#[inline]
pub fn lt(self, other: Self) -> $bool {
unsafe {simd_lt(self, other)}
}
/// Compare for less-than-or-equal.
#[inline]
pub fn le(self, other: Self) -> $bool {
unsafe {simd_le(self, other)}
}
/// Compare for greater-than.
#[inline]
pub fn gt(self, other: Self) -> $bool {
unsafe {simd_gt(self, other)}
}
/// Compare for greater-than-or-equal.
#[inline]
pub fn ge(self, other: Self) -> $bool {
unsafe {simd_ge(self, other)}
}
/// Extract the value of the `idx`th lane of `self`.
///
/// # Panics
///
/// `extract` will panic if `idx` is out of bounds.
#[inline]
pub fn extract(self, idx: u32) -> $elem {
assert!(idx < $length);
unsafe {simd_extract(self, idx)}
}
/// Return a new vector where the `idx`th lane is replaced
/// by `elem`.
///
/// # Panics
///
/// `replace` will panic if `idx` is out of bounds.
#[inline]
pub fn replace(self, idx: u32, elem: $elem) -> Self {
assert!(idx < $length);
unsafe {simd_insert(self, idx, elem)}
}
/// Load a new value from the `idx`th position of `array`.
///
/// This is equivalent to the following, but is possibly
/// more efficient:
///
/// ```rust,ignore
/// Self::new(array[idx], array[idx + 1], ...)
/// ```
///
/// # Panics
///
/// `load` will panic if `idx` is out of bounds in
/// `array`, or if `array[idx..]` is too short.
#[inline]
pub fn load(array: &[$elem], idx: usize) -> Self {
let data = &array[idx..idx + $length];
let loaded = unsafe {
*(data.as_ptr() as *const Unalign<Self>)
};
loaded.0
}
/// Store the elements of `self` to `array`, starting at
/// the `idx`th position.
///
/// This is equivalent to the following, but is possibly
/// more efficient:
///
/// ```rust,ignore
/// array[i] = self.extract(0);
/// array[i + 1] = self.extract(1);
/// // ...
/// ```
///
/// # Panics
///
/// `store` will panic if `idx` is out of bounds in
/// `array`, or if `array[idx..]` is too short.
#[inline]
pub fn store(self, array: &mut [$elem], idx: usize) {
let place = &mut array[idx..idx + $length];
unsafe {
*(place.as_mut_ptr() as *mut Unalign<Self>) = Unalign(self)
}
}
})*
}
}
basic_impls! {
u32x4: u32, bool32ix4, simd_shuffle4, 4, x0, x1 | x2, x3;
i32x4: i32, bool32ix4, simd_shuffle4, 4, x0, x1 | x2, x3;
f32x4: f32, bool32fx4, simd_shuffle4, 4, x0, x1 | x2, x3;
u16x8: u16, bool16ix8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;
i16x8: i16, bool16ix8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;
u8x16: u8, bool8ix16, simd_shuffle16, 16, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15;
i8x16: i8, bool8ix16, simd_shuffle16, 16, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15;
}
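// Editor's sketch: a minimal exercise of the API generated by `basic_impls!`
// above (constructors, lane access, and unaligned load/store).
#[cfg(test)]
mod basic_impls_example {
    use super::super::*;

    #[test]
    fn lanes_and_memory() {
        let v = u32x4::new(1, 2, 3, 4);
        assert_eq!(v.extract(2), 3);
        let w = v.replace(0, 9); // non-destructive: returns a new vector
        let mut buf = [0u32; 6];
        w.store(&mut buf, 1); // unaligned store starting at index 1
        assert_eq!(u32x4::load(&buf, 1).extract(0), 9);
    }
}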
macro_rules! bool_impls {
($(
$name: ident:
$elem: ident, $repr: ident, $repr_elem: ident, $length: expr, $all: ident, $any: ident,
$($first: ident),* | $($last: ident),*
[$(#[$cvt_meta: meta] $cvt: ident -> $cvt_to: ident),*];
)*) => {
$(impl $name {
/// Convert to integer representation.
#[inline]
pub fn to_repr(self) -> $repr {
unsafe {mem::transmute(self)}
}
/// Convert from integer representation.
#[inline]
pub fn from_repr(x: $repr) -> Self {
unsafe {mem::transmute(x)}
}
/// Create a new instance.
#[inline]
pub fn new($($first: bool),*, $($last: bool),*) -> $name {
unsafe {
// negate everything together
simd_sub($name::splat(false),
$name($( ($first as $repr_elem) ),*,
$( ($last as $repr_elem) ),*))
}
}
/// Create a new instance where every lane has value `x`.
#[allow(unused_variables)]
#[inline]
pub fn splat(x: bool) -> $name {
let x = if x {!(0 as $repr_elem)} else {0};
$name($({ let $first = (); x}),*,
$({ let $last = (); x}),*)
}
/// Extract the value of the `idx`th lane of `self`.
///
/// # Panics
///
/// `extract` will panic if `idx` is out of bounds.
#[inline]
pub fn extract(self, idx: u32) -> bool {
assert!(idx < $length);
unsafe {simd_extract(self.to_repr(), idx) != 0}
}
/// Return a new vector where the `idx`th lane is replaced
/// by `elem`.
///
/// # Panics
///
/// `replace` will panic if `idx` is out of bounds.
#[inline]
pub fn replace(self, idx: u32, elem: bool) -> Self {
assert!(idx < $length);
let x = if elem {!(0 as $repr_elem)} else {0};
unsafe {Self::from_repr(simd_insert(self.to_repr(), idx, x))}
}
/// Select between elements of `then` and `else_`, based on
/// the corresponding element of `self`.
///
/// This is equivalent to the following, but is possibly
/// more efficient:
///
/// ```rust,ignore
/// T::new(if self.extract(0) { then.extract(0) } else { else_.extract(0) },
/// if self.extract(1) { then.extract(1) } else { else_.extract(1) },
/// ...)
/// ```
#[inline]
pub fn select<T: Simd<Bool = $name>>(self, then: T, else_: T) -> T {
let then: $repr = bitcast(then);
let else_: $repr = bitcast(else_);
bitcast((then & self.to_repr()) | (else_ & (!self).to_repr()))
}
/// Check if every element of `self` is true.
///
/// This is equivalent to the following, but is possibly
/// more efficient:
///
/// ```rust,ignore
/// self.extract(0) && self.extract(1) && ...
/// ```
#[inline]
pub fn all(self) -> bool {
common::$all(self)
}
/// Check if any element of `self` is true.
///
/// This is equivalent to the following, but is possibly
/// more efficient:
///
/// ```rust,ignore
/// self.extract(0) || self.extract(1) || ...
/// ```
#[inline]
pub fn any(self) -> bool {
common::$any(self)
}
$(
#[$cvt_meta]
#[inline]
pub fn $cvt(self) -> $cvt_to {
bitcast(self)
}
)*
}
impl ops::Not for $name {
type Output = Self;
#[inline]
fn not(self) -> Self {
Self::from_repr($repr::splat(!(0 as $repr_elem)) ^ self.to_repr())
}
}
)*
}
}
bool_impls! {
bool32ix4: bool32i, i32x4, i32, 4, bool32ix4_all, bool32ix4_any, x0, x1 | x2, x3
[/// Convert `self` to a boolean vector for interacting with floating point vectors.
to_f -> bool32fx4];
bool32fx4: bool32f, i32x4, i32, 4, bool32fx4_all, bool32fx4_any, x0, x1 | x2, x3
[/// Convert `self` to a boolean vector for interacting with integer vectors.
to_i -> bool32ix4];
bool16ix8: bool16i, i16x8, i16, 8, bool16ix8_all, bool16ix8_any, x0, x1, x2, x3 | x4, x5, x6, x7 [];
bool8ix16: bool8i, i8x16, i8, 16, bool8ix16_all, bool8ix16_any, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15 [];
}
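// Editor's sketch: branch-free per-lane clamping with the comparison
// methods and the `select` blend generated by `bool_impls!` above.
#[cfg(test)]
mod bool_impls_example {
    use super::super::*;

    #[test]
    fn clamp_via_select() {
        let x = i32x4::new(-1, 100, 300, 7);
        let limit = i32x4::splat(255);
        let clamped = x.gt(limit).select(limit, x); // min(x, 255) per lane
        assert_eq!(clamped.extract(2), 255);
        assert!(clamped.le(limit).all());
    }
}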
impl u32x4 {
/// Convert each lane to a signed integer.
#[inline]
pub fn to_i32(self) -> i32x4 {
unsafe {simd_cast(self)}
}
/// Convert each lane to a 32-bit float.
#[inline]
pub fn to_f32(self) -> f32x4 {
unsafe {simd_cast(self)}
}
}
impl i32x4 {
/// Convert each lane to an unsigned integer.
#[inline]
pub fn to_u32(self) -> u32x4 {
unsafe {simd_cast(self)}
}
/// Convert each lane to a 32-bit float.
#[inline]
pub fn to_f32(self) -> f32x4 {
unsafe {simd_cast(self)}
}
}
impl f32x4 {
/// Compute the square root of each lane.
#[inline]
pub fn sqrt(self) -> Self {
common::f32x4_sqrt(self)
}
/// Compute an approximation to the reciprocal of the square root
/// of `self`, that is, `f32x4::splat(1.0) / self.sqrt()`.
///
/// The accuracy of this approximation is platform dependent.
#[inline]
pub fn approx_rsqrt(self) -> Self {
common::f32x4_approx_rsqrt(self)
}
/// Compute an approximation to the reciprocal of `self`, that is,
/// `f32x4::splat(1.0) / self`.
///
/// The accuracy of this approximation is platform dependent.
#[inline]
pub fn approx_reciprocal(self) -> Self {
common::f32x4_approx_reciprocal(self)
}
/// Compute the lane-wise maximum of `self` and `other`.
///
/// This is equivalent to the following, but is possibly more
/// efficient:
///
/// ```rust,ignore
/// f32x4::new(self.extract(0).max(other.extract(0)),
/// self.extract(1).max(other.extract(1)),
/// ...)
/// ```
#[inline]
pub fn max(self, other: Self) -> Self {
common::f32x4_max(self, other)
}
/// Compute the lane-wise minimum of `self` and `other`.
///
/// This is equivalent to the following, but is possibly more
/// efficient:
///
/// ```rust,ignore
/// f32x4::new(self.extract(0).min(other.extract(0)),
/// self.extract(1).min(other.extract(1)),
/// ...)
/// ```
#[inline]
pub fn min(self, other: Self) -> Self {
common::f32x4_min(self, other)
}
/// Convert each lane to a signed integer.
#[inline]
pub fn to_i32(self) -> i32x4 {
unsafe {simd_cast(self)}
}
/// Convert each lane to an unsigned integer.
#[inline]
pub fn to_u32(self) -> u32x4 {
unsafe {simd_cast(self)}
}
}
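// Editor's sketch: the approximate reciprocal square root is typically
// refined with one Newton-Raphson step, y' = y * (1.5 - 0.5 * x * y * y),
// which roughly doubles the number of accurate bits of the estimate.
#[cfg(test)]
mod approx_example {
    use super::super::*;

    #[test]
    fn refined_rsqrt() {
        let x = f32x4::splat(4.0);
        let y = x.approx_rsqrt(); // hardware estimate of 1/sqrt(x)
        let y = y * (f32x4::splat(1.5) - f32x4::splat(0.5) * x * y * y);
        assert!((y.extract(0) - 0.5).abs() < 1.0e-4);
    }
}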
impl i16x8 {
/// Convert each lane to an unsigned integer.
#[inline]
pub fn to_u16(self) -> u16x8 {
unsafe {simd_cast(self)}
}
}
impl u16x8 {
/// Convert each lane to a signed integer.
#[inline]
pub fn to_i16(self) -> i16x8 {
unsafe {simd_cast(self)}
}
}
impl i8x16 {
/// Convert each lane to an unsigned integer.
#[inline]
pub fn to_u8(self) -> u8x16 {
unsafe {simd_cast(self)}
}
}
impl u8x16 {
/// Convert each lane to a signed integer.
#[inline]
pub fn to_i8(self) -> i8x16 {
unsafe {simd_cast(self)}
}
}
macro_rules! neg_impls {
($zero: expr, $($ty: ident,)*) => {
$(impl ops::Neg for $ty {
type Output = Self;
fn neg(self) -> Self {
$ty::splat($zero) - self
}
})*
}
}
neg_impls!{
0,
i32x4,
i16x8,
i8x16,
}
neg_impls! {
0.0,
f32x4,
}
macro_rules! not_impls {
($($ty: ident,)*) => {
$(impl ops::Not for $ty {
type Output = Self;
fn not(self) -> Self {
$ty::splat(!0) ^ self
}
})*
}
}
not_impls! {
i32x4,
i16x8,
i8x16,
u32x4,
u16x8,
u8x16,
}
macro_rules! operators {
($($trayt: ident ($func: ident, $method: ident): $($ty: ty),*;)*) => {
$(
$(impl ops::$trayt for $ty {
type Output = Self;
#[inline]
fn $method(self, x: Self) -> Self {
unsafe {$func(self, x)}
}
})*
)*
}
}
operators! {
Add (simd_add, add):
i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
f32x4;
Sub (simd_sub, sub):
i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
f32x4;
Mul (simd_mul, mul):
i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
f32x4;
Div (simd_div, div): f32x4;
BitAnd (simd_and, bitand):
i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
bool8ix16, bool16ix8, bool32ix4,
bool32fx4;
BitOr (simd_or, bitor):
i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
bool8ix16, bool16ix8, bool32ix4,
bool32fx4;
BitXor (simd_xor, bitxor):
i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
bool8ix16, bool16ix8, bool32ix4,
bool32fx4;
}
macro_rules! shift_one {
($ty: ident, $($by: ident),*) => {
$(
impl ops::Shl<$by> for $ty {
type Output = Self;
#[inline]
fn shl(self, other: $by) -> Self {
unsafe { simd_shl(self, $ty::splat(other as <$ty as Simd>::Elem)) }
}
}
impl ops::Shr<$by> for $ty {
type Output = Self;
#[inline]
fn shr(self, other: $by) -> Self {
unsafe {simd_shr(self, $ty::splat(other as <$ty as Simd>::Elem))}
}
}
)*
}
}
macro_rules! shift {
($($ty: ident),*) => {
$(shift_one! {
$ty,
u8, u16, u32, u64, usize,
i8, i16, i32, i64, isize
})*
}
}
shift! {
i8x16, u8x16, i16x8, u16x8, i32x4, u32x4
}
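// Editor's sketch: with the operator impls above, lane-wise arithmetic and
// shifts read like scalar code; a scalar shift amount is splat to all lanes.
#[cfg(test)]
mod operator_example {
    use super::super::*;

    #[test]
    fn lanewise_ops() {
        let a = u32x4::new(8, 9, 10, 11) + u32x4::splat(1);
        assert_eq!((a << 4u8).extract(0), 144); // 9 << 4
        assert_eq!((!u32x4::splat(0)).extract(0), 0xffff_ffff);
    }
}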

third_party/rust/simd/src/lib.rs vendored Normal file
@@ -0,0 +1,225 @@
//! `simd` offers a basic interface to the SIMD functionality of CPUs.
#![feature(cfg_target_feature, repr_simd, platform_intrinsics, const_fn)]
#![allow(non_camel_case_types)]
#[cfg(feature = "with-serde")]
extern crate serde;
#[cfg(feature = "with-serde")]
#[macro_use]
extern crate serde_derive;
/// Boolean type for 8-bit integers.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct bool8i(i8);
/// Boolean type for 16-bit integers.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct bool16i(i16);
/// Boolean type for 32-bit integers.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct bool32i(i32);
/// Boolean type for 32-bit floats.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct bool32f(i32);
macro_rules! bool {
($($name: ident, $inner: ty;)*) => {
$(
impl From<bool> for $name {
#[inline]
fn from(b: bool) -> $name {
$name(-(b as $inner))
}
}
impl From<$name> for bool {
#[inline]
fn from(b: $name) -> bool {
b.0 != 0
}
}
)*
}
}
bool! {
bool8i, i8;
bool16i, i16;
bool32i, i32;
bool32f, i32;
}
/// Types that are SIMD vectors.
pub unsafe trait Simd {
/// The corresponding boolean vector type.
type Bool: Simd;
/// The element that this vector stores.
type Elem;
}
/// A SIMD vector of 4 `u32`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u32x4(u32, u32, u32, u32);
/// A SIMD vector of 4 `i32`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i32x4(i32, i32, i32, i32);
/// A SIMD vector of 4 `f32`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct f32x4(f32, f32, f32, f32);
/// A SIMD boolean vector for length-4 vectors of 32-bit integers.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool32ix4(i32, i32, i32, i32);
/// A SIMD boolean vector for length-4 vectors of 32-bit floats.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool32fx4(i32, i32, i32, i32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct u32x2(u32, u32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct i32x2(i32, i32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct f32x2(f32, f32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct bool32ix2(i32, i32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct bool32fx2(i32, i32);
/// A SIMD vector of 8 `u16`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u16x8(u16, u16, u16, u16,
u16, u16, u16, u16);
/// A SIMD vector of 8 `i16`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i16x8(i16, i16, i16, i16,
i16, i16, i16, i16);
/// A SIMD boolean vector for length-8 vectors of 16-bit integers.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool16ix8(i16, i16, i16, i16,
i16, i16, i16, i16);
/// A SIMD vector of 16 `u8`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u8x16(u8, u8, u8, u8, u8, u8, u8, u8,
u8, u8, u8, u8, u8, u8, u8, u8);
/// A SIMD vector of 16 `i8`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i8x16(i8, i8, i8, i8, i8, i8, i8, i8,
i8, i8, i8, i8, i8, i8, i8, i8);
/// A SIMD boolean vector for length-16 vectors of 8-bit integers.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool8ix16(i8, i8, i8, i8, i8, i8, i8, i8,
i8, i8, i8, i8, i8, i8, i8, i8);
macro_rules! simd {
($($bool: ty: $($ty: ty = $elem: ty),*;)*) => {
$($(unsafe impl Simd for $ty {
type Bool = $bool;
type Elem = $elem;
}
impl Clone for $ty { #[inline] fn clone(&self) -> Self { *self } }
)*)*}
}
simd! {
bool8ix16: i8x16 = i8, u8x16 = u8, bool8ix16 = bool8i;
bool16ix8: i16x8 = i16, u16x8 = u16, bool16ix8 = bool16i;
bool32ix4: i32x4 = i32, u32x4 = u32, bool32ix4 = bool32i;
bool32fx4: f32x4 = f32, bool32fx4 = bool32f;
bool32ix2: i32x2 = i32, u32x2 = u32, bool32ix2 = bool32i;
bool32fx2: f32x2 = f32, bool32fx2 = bool32f;
}
#[allow(dead_code)]
#[inline]
fn bitcast<T: Simd, U: Simd>(x: T) -> U {
assert_eq!(std::mem::size_of::<T>(),
std::mem::size_of::<U>());
unsafe {std::mem::transmute_copy(&x)}
}
#[allow(dead_code)]
extern "platform-intrinsic" {
fn simd_eq<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
fn simd_ne<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
fn simd_lt<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
fn simd_le<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
fn simd_gt<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
fn simd_ge<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
fn simd_shuffle2<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 2]) -> U;
fn simd_shuffle4<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 4]) -> U;
fn simd_shuffle8<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 8]) -> U;
fn simd_shuffle16<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 16]) -> U;
fn simd_insert<T: Simd<Elem = U>, U>(x: T, idx: u32, val: U) -> T;
fn simd_extract<T: Simd<Elem = U>, U>(x: T, idx: u32) -> U;
fn simd_cast<T: Simd, U: Simd>(x: T) -> U;
fn simd_add<T: Simd>(x: T, y: T) -> T;
fn simd_sub<T: Simd>(x: T, y: T) -> T;
fn simd_mul<T: Simd>(x: T, y: T) -> T;
fn simd_div<T: Simd>(x: T, y: T) -> T;
fn simd_shl<T: Simd>(x: T, y: T) -> T;
fn simd_shr<T: Simd>(x: T, y: T) -> T;
fn simd_and<T: Simd>(x: T, y: T) -> T;
fn simd_or<T: Simd>(x: T, y: T) -> T;
fn simd_xor<T: Simd>(x: T, y: T) -> T;
}
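// Editor's note (illustrative): `simd_shuffleN` selects lanes from the
// 2N-lane concatenation of `x` and `y`; indices 0..N address `x` and N..2N
// address `y`, and the index array must be a compile-time constant. E.g.
// reversing a vector:
//
//     let r: f32x4 = unsafe { simd_shuffle4(v, v, [3, 2, 1, 0]) };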
#[repr(packed)]
#[derive(Debug, Copy, Clone)]
struct Unalign<T>(T);
#[macro_use]
mod common;
mod sixty_four;
mod v256;
#[cfg(any(feature = "doc",
target_arch = "x86",
target_arch = "x86_64"))]
pub mod x86;
#[cfg(any(feature = "doc", target_arch = "arm"))]
pub mod arm;
#[cfg(any(feature = "doc", target_arch = "aarch64"))]
pub mod aarch64;

third_party/rust/simd/src/sixty_four.rs vendored Normal file
@@ -0,0 +1,229 @@
#![allow(dead_code)]
use super::*;
#[allow(unused_imports)]
use super::{
f32x2,
simd_eq, simd_ne, simd_lt, simd_le, simd_gt, simd_ge,
simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
simd_insert, simd_extract,
simd_cast,
simd_add, simd_sub, simd_mul, simd_div, simd_shl, simd_shr, simd_and, simd_or, simd_xor,
Unalign, bitcast,
};
use std::mem;
use std::ops;
/// Boolean type for 64-bit integers.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone)]
pub struct bool64i(i64);
/// Boolean type for 64-bit floats.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone)]
pub struct bool64f(i64);
/// A SIMD vector of 2 `u64`s.
#[repr(simd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u64x2(u64, u64);
/// A SIMD vector of 2 `i64`s.
#[repr(simd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i64x2(i64, i64);
/// A SIMD vector of 2 `f64`s.
#[repr(simd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct f64x2(f64, f64);
/// A SIMD boolean vector for length-2 vectors of 64-bit integers.
#[repr(simd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool64ix2(i64, i64);
/// A SIMD boolean vector for length-2 vectors of 64-bit floats.
#[repr(simd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool64fx2(i64, i64);
simd! {
bool64ix2: i64x2 = i64, u64x2 = u64, bool64ix2 = bool64i;
bool64fx2: f64x2 = f64, bool64fx2 = bool64f;
}
basic_impls! {
u64x2: u64, bool64ix2, simd_shuffle2, 2, x0 | x1;
i64x2: i64, bool64ix2, simd_shuffle2, 2, x0 | x1;
f64x2: f64, bool64fx2, simd_shuffle2, 2, x0 | x1;
}
mod common {
use super::*;
// naive for now
#[inline]
pub fn bool64ix2_all(x: bool64ix2) -> bool {
x.0 != 0 && x.1 != 0
}
#[inline]
pub fn bool64ix2_any(x: bool64ix2) -> bool {
x.0 != 0 || x.1 != 0
}
#[inline]
pub fn bool64fx2_all(x: bool64fx2) -> bool {
x.0 != 0 && x.1 != 0
}
#[inline]
pub fn bool64fx2_any(x: bool64fx2) -> bool {
x.0 != 0 || x.1 != 0
    }
}
bool_impls! {
bool64ix2: bool64i, i64x2, i64, 2, bool64ix2_all, bool64ix2_any, x0 | x1
[/// Convert `self` to a boolean vector for interacting with floating point vectors.
to_f -> bool64fx2];
bool64fx2: bool64f, i64x2, i64, 2, bool64fx2_all, bool64fx2_any, x0 | x1
[/// Convert `self` to a boolean vector for interacting with integer vectors.
to_i -> bool64ix2];
}
impl u64x2 {
/// Convert each lane to a signed integer.
#[inline]
pub fn to_i64(self) -> i64x2 {
unsafe {simd_cast(self)}
}
/// Convert each lane to a 64-bit float.
#[inline]
pub fn to_f64(self) -> f64x2 {
unsafe {simd_cast(self)}
}
}
impl i64x2 {
/// Convert each lane to an unsigned integer.
#[inline]
pub fn to_u64(self) -> u64x2 {
unsafe {simd_cast(self)}
}
/// Convert each lane to a 64-bit float.
#[inline]
pub fn to_f64(self) -> f64x2 {
unsafe {simd_cast(self)}
}
}
impl f64x2 {
/// Convert each lane to a signed integer.
#[inline]
pub fn to_i64(self) -> i64x2 {
unsafe {simd_cast(self)}
}
/// Convert each lane to an unsigned integer.
#[inline]
pub fn to_u64(self) -> u64x2 {
unsafe {simd_cast(self)}
}
/// Convert each lane to a 32-bit float.
#[inline]
pub fn to_f32(self) -> f32x4 {
unsafe {
let x: f32x2 = simd_cast(self);
f32x4::new(x.0, x.1, 0.0, 0.0)
}
}
}
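// Editor's note (illustrative): the narrowing conversion above keeps the two
// converted lanes in the low half and zero-fills the rest, i.e.
//
//     f64x2::new(1.5, 2.5).to_f32() // == f32x4::new(1.5, 2.5, 0.0, 0.0)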
neg_impls!{
0,
i64x2,
}
neg_impls! {
0.0,
f64x2,
}
macro_rules! not_impls {
($($ty: ident,)*) => {
$(impl ops::Not for $ty {
type Output = Self;
fn not(self) -> Self {
$ty::splat(!0) ^ self
}
})*
}
}
not_impls! {
i64x2,
u64x2,
}
macro_rules! operators {
($($trayt: ident ($func: ident, $method: ident): $($ty: ty),*;)*) => {
$(
$(impl ops::$trayt for $ty {
type Output = Self;
#[inline]
fn $method(self, x: Self) -> Self {
unsafe {$func(self, x)}
}
})*
)*
}
}
operators! {
Add (simd_add, add):
i64x2, u64x2,
f64x2;
Sub (simd_sub, sub):
i64x2, u64x2,
f64x2;
Mul (simd_mul, mul):
i64x2, u64x2,
f64x2;
Div (simd_div, div): f64x2;
BitAnd (simd_and, bitand):
i64x2, u64x2,
bool64ix2,
bool64fx2;
BitOr (simd_or, bitor):
i64x2, u64x2,
bool64ix2,
bool64fx2;
BitXor (simd_xor, bitxor):
i64x2, u64x2,
bool64ix2,
bool64fx2;
}
macro_rules! shift_one {
    ($ty: ident, $($by: ident),*) => {
$(
impl ops::Shl<$by> for $ty {
type Output = Self;
#[inline]
fn shl(self, other: $by) -> Self {
unsafe { simd_shl(self, $ty::splat(other as <$ty as Simd>::Elem)) }
}
}
impl ops::Shr<$by> for $ty {
type Output = Self;
#[inline]
fn shr(self, other: $by) -> Self {
unsafe {simd_shr(self, $ty::splat(other as <$ty as Simd>::Elem))}
}
}
)*
}
}
macro_rules! shift {
($($ty: ident),*) => {
$(shift_one! {
$ty,
u8, u16, u32, u64, usize,
i8, i16, i32, i64, isize
})*
}
}
shift! {
i64x2, u64x2
}

third_party/rust/simd/src/v256.rs vendored Normal file
@@ -0,0 +1,424 @@
#![allow(dead_code)]
use std::ops;
use std::mem;
#[allow(unused_imports)]
use super::{
Simd,
u32x4, i32x4, u16x8, i16x8, u8x16, i8x16, f32x4,
bool32ix4, bool16ix8, bool8ix16, bool32fx4,
simd_eq, simd_ne, simd_lt, simd_le, simd_gt, simd_ge,
simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
simd_insert, simd_extract,
simd_cast,
simd_add, simd_sub, simd_mul, simd_div, simd_shl, simd_shr, simd_and, simd_or, simd_xor,
bool8i, bool16i, bool32i, bool32f,
Unalign, bitcast,
};
use super::sixty_four::*;
#[cfg(target_feature = "avx")]
use super::x86::avx::common;
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u64x4(u64, u64, u64, u64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i64x4(i64, i64, i64, i64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct f64x4(f64, f64, f64, f64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool64ix4(i64, i64, i64, i64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool64fx4(i64, i64, i64, i64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u32x8(u32, u32, u32, u32,
u32, u32, u32, u32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i32x8(i32, i32, i32, i32,
i32, i32, i32, i32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct f32x8(f32, f32, f32, f32,
f32, f32, f32, f32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool32ix8(i32, i32, i32, i32,
i32, i32, i32, i32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool32fx8(i32, i32, i32, i32,
i32, i32, i32, i32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u16x16(u16, u16, u16, u16, u16, u16, u16, u16,
u16, u16, u16, u16, u16, u16, u16, u16);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i16x16(i16, i16, i16, i16, i16, i16, i16, i16,
i16, i16, i16, i16, i16, i16, i16, i16);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool16ix16(i16, i16, i16, i16, i16, i16, i16, i16,
i16, i16, i16, i16, i16, i16, i16, i16);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u8x32(u8, u8, u8, u8, u8, u8, u8, u8,
u8, u8, u8, u8, u8, u8, u8, u8,
u8, u8, u8, u8, u8, u8, u8, u8,
u8, u8, u8, u8, u8, u8, u8, u8);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i8x32(i8, i8, i8, i8, i8, i8, i8, i8,
i8, i8, i8, i8, i8, i8, i8, i8,
i8, i8, i8, i8, i8, i8, i8, i8,
i8, i8, i8, i8, i8, i8, i8, i8);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool8ix32(i8, i8, i8, i8, i8, i8, i8, i8,
i8, i8, i8, i8, i8, i8, i8, i8,
i8, i8, i8, i8, i8, i8, i8, i8,
i8, i8, i8, i8, i8, i8, i8, i8);
simd! {
bool8ix32: i8x32 = i8, u8x32 = u8, bool8ix32 = bool8i;
bool16ix16: i16x16 = i16, u16x16 = u16, bool16ix16 = bool16i;
bool32ix8: i32x8 = i32, u32x8 = u32, bool32ix8 = bool32i;
bool64ix4: i64x4 = i64, u64x4 = u64, bool64ix4 = bool64i;
bool32fx8: f32x8 = f32, bool32fx8 = bool32f;
bool64fx4: f64x4 = f64, bool64fx4 = bool64f;
}
basic_impls! {
u64x4: u64, bool64ix4, simd_shuffle4, 4, x0, x1 | x2, x3;
i64x4: i64, bool64ix4, simd_shuffle4, 4, x0, x1 | x2, x3;
f64x4: f64, bool64fx4, simd_shuffle4, 4, x0, x1 | x2, x3;
u32x8: u32, bool32ix8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;
i32x8: i32, bool32ix8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;
f32x8: f32, bool32fx8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;
u16x16: u16, bool16ix16, simd_shuffle16, 16, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15;
i16x16: i16, bool16ix16, simd_shuffle16, 16, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15;
u8x32: u8, bool8ix32, simd_shuffle32, 32, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |
x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31;
i8x32: i8, bool8ix32, simd_shuffle32, 32, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |
x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31;
}
#[cfg(not(target_feature = "avx"))]
#[doc(hidden)]
mod common {
use super::*;
// implementation via SSE vectors
macro_rules! bools {
($($ty: ty, $all: ident, $any: ident;)*) => {
$(
#[inline]
pub fn $all(x: $ty) -> bool {
x.low().all() && x.high().all()
}
#[inline]
pub fn $any(x: $ty) -> bool {
x.low().any() || x.high().any()
}
)*
}
}
bools! {
bool64ix4, bool64ix4_all, bool64ix4_any;
bool64fx4, bool64fx4_all, bool64fx4_any;
bool32ix8, bool32ix8_all, bool32ix8_any;
bool32fx8, bool32fx8_all, bool32fx8_any;
bool16ix16, bool16ix16_all, bool16ix16_any;
bool8ix32, bool8ix32_all, bool8ix32_any;
}
}
bool_impls! {
bool64ix4: bool64i, i64x4, i64, 4, bool64ix4_all, bool64ix4_any, x0, x1 | x2, x3
[/// Convert `self` to a boolean vector for interacting with floating point vectors.
to_f -> bool64fx4];
bool64fx4: bool64f, i64x4, i64, 4, bool64fx4_all, bool64fx4_any, x0, x1 | x2, x3
[/// Convert `self` to a boolean vector for interacting with integer vectors.
to_i -> bool64ix4];
bool32ix8: bool32i, i32x8, i32, 8, bool32ix8_all, bool32ix8_any, x0, x1, x2, x3 | x4, x5, x6, x7
[/// Convert `self` to a boolean vector for interacting with floating point vectors.
to_f -> bool32fx8];
bool32fx8: bool32f, i32x8, i32, 8, bool32fx8_all, bool32fx8_any, x0, x1, x2, x3 | x4, x5, x6, x7
[/// Convert `self` to a boolean vector for interacting with integer vectors.
to_i -> bool32ix8];
bool16ix16: bool16i, i16x16, i16, 16, bool16ix16_all, bool16ix16_any,
x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15 [];
bool8ix32: bool8i, i8x32, i8, 32, bool8ix32_all, bool8ix32_any,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |
x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 [];
}
pub trait LowHigh128 {
type Half: Simd;
/// Extract the low 128 bit part.
fn low(self) -> Self::Half;
/// Extract the high 128 bit part.
fn high(self) -> Self::Half;
}
macro_rules! expr { ($x:expr) => ($x) } // HACK
macro_rules! low_high_impls {
($(
$name: ident, $half: ident, $($first: tt),+ ... $($last: tt),+;
)*) => {
$(impl LowHigh128 for $name {
type Half = $half;
#[inline]
fn low(self) -> Self::Half {
$half::new($( expr!(self.$first), )*)
}
#[inline]
fn high(self) -> Self::Half {
$half::new($( expr!(self.$last), )*)
}
})*
}
}
low_high_impls! {
u64x4, u64x2, 0, 1 ... 2, 3;
i64x4, i64x2, 0, 1 ... 2, 3;
f64x4, f64x2, 0, 1 ... 2, 3;
u32x8, u32x4, 0, 1, 2, 3 ... 4, 5, 6, 7;
i32x8, i32x4, 0, 1, 2, 3 ... 4, 5, 6, 7;
f32x8, f32x4, 0, 1, 2, 3 ... 4, 5, 6, 7;
u16x16, u16x8, 0, 1, 2, 3, 4, 5, 6, 7 ... 8, 9, 10, 11, 12, 13, 14, 15;
i16x16, i16x8, 0, 1, 2, 3, 4, 5, 6, 7 ... 8, 9, 10, 11, 12, 13, 14, 15;
u8x32, u8x16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ...
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31;
i8x32, i8x16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ...
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31;
}
macro_rules! bool_low_high_impls {
($(
$name: ident: $half: ident;
)*) => {
$(impl LowHigh128 for $name {
type Half = $half;
/// Extract the low 128 bit part.
#[inline]
fn low(self) -> Self::Half {
Self::Half::from_repr(self.to_repr().low())
}
/// Extract the high 128 bit part.
#[inline]
fn high(self) -> Self::Half {
Self::Half::from_repr(self.to_repr().high())
}
})*
}
}
bool_low_high_impls! {
bool64fx4: bool64fx2;
bool32fx8: bool32fx4;
bool64ix4: bool64ix2;
bool32ix8: bool32ix4;
bool16ix16: bool16ix8;
bool8ix32: bool8ix16;
}
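// Editor's sketch: `LowHigh128` splits a 256-bit vector into its two
// SSE-sized halves; this is exactly how the non-AVX fallback above reduces
// 256-bit boolean masks.
#[cfg(test)]
mod low_high_example {
    use super::*;

    #[test]
    fn split_halves() {
        let v = u32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq!(v.low().extract(0), 0);
        assert_eq!(v.high().extract(3), 7);
    }
}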
impl u64x4 {
/// Convert each lane to a signed integer.
#[inline]
pub fn to_i64(self) -> i64x4 {
unsafe {simd_cast(self)}
}
/// Convert each lane to a 64-bit float.
#[inline]
pub fn to_f64(self) -> f64x4 {
unsafe {simd_cast(self)}
}
}
impl i64x4 {
/// Convert each lane to an unsigned integer.
#[inline]
pub fn to_u64(self) -> u64x4 {
unsafe {simd_cast(self)}
}
/// Convert each lane to a 64-bit float.
#[inline]
pub fn to_f64(self) -> f64x4 {
unsafe {simd_cast(self)}
}
}
impl f64x4 {
/// Convert each lane to a signed integer.
#[inline]
pub fn to_i64(self) -> i64x4 {
unsafe {simd_cast(self)}
}
/// Convert each lane to an unsigned integer.
#[inline]
pub fn to_u64(self) -> u64x4 {
unsafe {simd_cast(self)}
}
}
impl u32x8 {
/// Convert each lane to a signed integer.
#[inline]
pub fn to_i32(self) -> i32x8 {
unsafe {simd_cast(self)}
}
/// Convert each lane to a 32-bit float.
#[inline]
pub fn to_f32(self) -> f32x8 {
unsafe {simd_cast(self)}
}
}
impl i32x8 {
/// Convert each lane to an unsigned integer.
#[inline]
pub fn to_u32(self) -> u32x8 {
unsafe {simd_cast(self)}
}
/// Convert each lane to a 32-bit float.
#[inline]
pub fn to_f32(self) -> f32x8 {
unsafe {simd_cast(self)}
}
}
impl i16x16 {
/// Convert each lane to an unsigned integer.
#[inline]
pub fn to_u16(self) -> u16x16 {
unsafe {simd_cast(self)}
}
}
impl u16x16 {
/// Convert each lane to a signed integer.
#[inline]
pub fn to_i16(self) -> i16x16 {
unsafe {simd_cast(self)}
}
}
impl i8x32 {
/// Convert each lane to an unsigned integer.
#[inline]
pub fn to_u8(self) -> u8x32 {
unsafe {simd_cast(self)}
}
}
impl u8x32 {
/// Convert each lane to a signed integer.
#[inline]
pub fn to_i8(self) -> i8x32 {
unsafe {simd_cast(self)}
}
}
operators! {
Add (simd_add, add):
i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
f64x4, f32x8;
Sub (simd_sub, sub):
i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
f64x4, f32x8;
Mul (simd_mul, mul):
i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
f64x4, f32x8;
Div (simd_div, div): f64x4, f32x8;
BitAnd (simd_and, bitand):
i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
bool64ix4, bool32ix8, bool16ix16,
bool64fx4, bool32fx8;
BitOr (simd_or, bitor):
i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
bool64ix4, bool32ix8, bool16ix16,
bool64fx4, bool32fx8;
BitXor (simd_xor, bitxor):
i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
bool64ix4, bool32ix8, bool16ix16,
bool64fx4, bool32fx8;
}
neg_impls!{
0,
i64x4,
i32x8,
i16x16,
i8x32,
}
neg_impls! {
0.0,
f64x4,
f32x8,
}
not_impls! {
i64x4,
u64x4,
i32x8,
u32x8,
i16x16,
u16x16,
i8x32,
u8x32,
}
shift! {
i64x4,
u64x4,
i32x8,
u32x8,
i16x16,
u16x16,
i8x32,
u8x32
}

third_party/rust/simd/src/x86/avx.rs vendored Normal file
@@ -0,0 +1,290 @@
use super::super::*;
use sixty_four::*;
use super::super::bitcast;
pub use v256::{
f64x4, bool64fx4, u64x4, i64x4, bool64ix4,
f32x8, bool32fx8, u32x8, i32x8, bool32ix8,
u16x16, i16x16, bool16ix16,
u8x32, i8x32, bool8ix32,
LowHigh128
};
#[allow(dead_code)]
extern "platform-intrinsic" {
fn x86_mm256_addsub_ps(x: f32x8, y: f32x8) -> f32x8;
fn x86_mm256_addsub_pd(x: f64x4, y: f64x4) -> f64x4;
fn x86_mm256_dp_ps(x: f32x8, y: f32x8, z: i32) -> f32x8;
fn x86_mm256_hadd_ps(x: f32x8, y: f32x8) -> f32x8;
fn x86_mm256_hadd_pd(x: f64x4, y: f64x4) -> f64x4;
fn x86_mm256_hsub_ps(x: f32x8, y: f32x8) -> f32x8;
fn x86_mm256_hsub_pd(x: f64x4, y: f64x4) -> f64x4;
fn x86_mm256_max_ps(x: f32x8, y: f32x8) -> f32x8;
fn x86_mm256_max_pd(x: f64x4, y: f64x4) -> f64x4;
fn x86_mm256_min_ps(x: f32x8, y: f32x8) -> f32x8;
fn x86_mm256_min_pd(x: f64x4, y: f64x4) -> f64x4;
fn x86_mm256_movemask_ps(x: f32x8) -> i32;
fn x86_mm256_movemask_pd(x: f64x4) -> i32;
fn x86_mm_permutevar_ps(x: f32x4, y: i32x4) -> f32x4;
fn x86_mm_permutevar_pd(x: f64x2, y: i64x2) -> f64x2;
fn x86_mm256_permutevar_ps(x: f32x8, y: i32x8) -> f32x8;
fn x86_mm256_permutevar_pd(x: f64x4, y: i64x4) -> f64x4;
fn x86_mm256_rcp_ps(x: f32x8) -> f32x8;
fn x86_mm256_rsqrt_ps(x: f32x8) -> f32x8;
fn x86_mm256_sqrt_ps(x: f32x8) -> f32x8;
fn x86_mm256_sqrt_pd(x: f64x4) -> f64x4;
fn x86_mm_testc_ps(x: f32x4, y: f32x4) -> i32;
fn x86_mm256_testc_ps(x: f32x8, y: f32x8) -> i32;
fn x86_mm_testc_pd(x: f64x2, y: f64x2) -> i32;
fn x86_mm256_testc_pd(x: f64x4, y: f64x4) -> i32;
fn x86_mm256_testc_si256(x: u64x4, y: u64x4) -> i32;
fn x86_mm_testnzc_ps(x: f32x4, y: f32x4) -> i32;
fn x86_mm256_testnzc_ps(x: f32x8, y: f32x8) -> i32;
fn x86_mm_testnzc_pd(x: f64x2, y: f64x2) -> i32;
fn x86_mm256_testnzc_pd(x: f64x4, y: f64x4) -> i32;
fn x86_mm256_testnzc_si256(x: u64x4, y: u64x4) -> i32;
fn x86_mm_testz_ps(x: f32x4, y: f32x4) -> i32;
fn x86_mm256_testz_ps(x: f32x8, y: f32x8) -> i32;
fn x86_mm_testz_pd(x: f64x2, y: f64x2) -> i32;
fn x86_mm256_testz_pd(x: f64x4, y: f64x4) -> i32;
fn x86_mm256_testz_si256(x: u64x4, y: u64x4) -> i32;
}
#[doc(hidden)]
pub mod common {
use super::*;
use std::mem;
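    // Editor's note: `all` relies on VPTEST semantics — `testc(x, ones)` is
    // non-zero iff `!x & ones == 0`, i.e. every bit of `x` is set — while
    // `any` uses `testz(x, x)`, whose result is zero iff some bit of `x` is
    // set.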
macro_rules! bools {
($($ty: ty, $all: ident, $any: ident, $testc: ident, $testz: ident;)*) => {
$(
#[inline]
pub fn $all(x: $ty) -> bool {
unsafe {
super::$testc(mem::transmute(x), mem::transmute(<$ty>::splat(true))) != 0
}
}
#[inline]
pub fn $any(x: $ty) -> bool {
unsafe {
super::$testz(mem::transmute(x), mem::transmute(x)) == 0
}
}
)*
}
}
bools! {
bool32fx8, bool32fx8_all, bool32fx8_any, x86_mm256_testc_ps, x86_mm256_testz_ps;
bool64fx4, bool64fx4_all, bool64fx4_any, x86_mm256_testc_pd, x86_mm256_testz_pd;
bool8ix32, bool8ix32_all, bool8ix32_any, x86_mm256_testc_si256, x86_mm256_testz_si256;
bool16ix16, bool16ix16_all, bool16ix16_any, x86_mm256_testc_si256, x86_mm256_testz_si256;
bool32ix8, bool32ix8_all, bool32ix8_any, x86_mm256_testc_si256, x86_mm256_testz_si256;
bool64ix4, bool64ix4_all, bool64ix4_any, x86_mm256_testc_si256, x86_mm256_testz_si256;
}
}
// 128-bit vectors:
// 32 bit floats
pub trait AvxF32x4 {
fn permutevar(self, other: i32x4) -> f32x4;
}
impl AvxF32x4 for f32x4 {
fn permutevar(self, other: i32x4) -> f32x4 {
unsafe { x86_mm_permutevar_ps(self, other) }
}
}
pub trait AvxF64x4 {
fn sqrt(self) -> Self;
fn addsub(self, other: Self) -> Self;
fn hadd(self, other: Self) -> Self;
fn hsub(self, other: Self) -> Self;
fn max(self, other: Self) -> Self;
fn min(self, other: Self) -> Self;
fn move_mask(self) -> u32;
}
impl AvxF64x4 for f64x4 {
#[inline]
fn sqrt(self) -> Self {
unsafe { x86_mm256_sqrt_pd(self) }
}
#[inline]
fn addsub(self, other: Self) -> Self {
unsafe { x86_mm256_addsub_pd(self, other) }
}
#[inline]
fn hadd(self, other: Self) -> Self {
unsafe { x86_mm256_hadd_pd(self, other) }
}
#[inline]
fn hsub(self, other: Self) -> Self {
unsafe { x86_mm256_hsub_pd(self, other) }
}
#[inline]
fn max(self, other: Self) -> Self {
unsafe { x86_mm256_max_pd(self, other) }
}
#[inline]
fn min(self, other: Self) -> Self {
unsafe { x86_mm256_min_pd(self, other) }
}
#[inline]
fn move_mask(self) -> u32 {
unsafe { x86_mm256_movemask_pd(self) as u32 }
}
}
pub trait AvxBool64fx4 {
fn move_mask(self) -> u32;
}
impl AvxBool64fx4 for bool64fx4 {
#[inline]
fn move_mask(self) -> u32 {
unsafe { x86_mm256_movemask_pd(bitcast(self)) as u32 }
}
}
pub trait AvxF32x8 {
fn sqrt(self) -> Self;
fn addsub(self, other: Self) -> Self;
fn hadd(self, other: Self) -> Self;
fn hsub(self, other: Self) -> Self;
fn max(self, other: Self) -> Self;
fn min(self, other: Self) -> Self;
fn move_mask(self) -> u32;
/// Compute an approximation to the reciprocal of the square root
/// of `self`, that is, `f32x8::splat(1.0) / self.sqrt()`.
///
/// The accuracy of this approximation is platform dependent.
fn approx_rsqrt(self) -> Self;
/// Compute an approximation to the reciprocal of `self`, that is,
/// `f32x8::splat(1.0) / self`.
///
/// The accuracy of this approximation is platform dependent.
fn approx_reciprocal(self) -> Self;
}
impl AvxF32x8 for f32x8 {
#[inline]
fn sqrt(self) -> Self {
unsafe { x86_mm256_sqrt_ps(self) }
}
#[inline]
fn addsub(self, other: Self) -> Self {
unsafe { x86_mm256_addsub_ps(self, other) }
}
#[inline]
fn hadd(self, other: Self) -> Self {
unsafe { x86_mm256_hadd_ps(self, other) }
}
#[inline]
fn hsub(self, other: Self) -> Self {
unsafe { x86_mm256_hsub_ps(self, other) }
}
#[inline]
fn max(self, other: Self) -> Self {
unsafe { x86_mm256_max_ps(self, other) }
}
#[inline]
fn min(self, other: Self) -> Self {
unsafe { x86_mm256_min_ps(self, other) }
}
#[inline]
fn move_mask(self) -> u32 {
unsafe { x86_mm256_movemask_ps(self) as u32 }
}
#[inline]
fn approx_reciprocal(self) -> Self {
unsafe { x86_mm256_rcp_ps(self) }
}
#[inline]
fn approx_rsqrt(self) -> Self {
unsafe { x86_mm256_rsqrt_ps(self) }
}
}
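// Editor's sketch: a common horizontal-sum pattern built from the `hadd`
// wrapper above and the `LowHigh128` split; `hadd` adds adjacent lane pairs
// within each 128-bit half, so two rounds plus one cross-half add finish
// the reduction.
#[cfg(test)]
mod hadd_example {
    use super::*;

    #[test]
    fn horizontal_sum() {
        let v = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let h = v.hadd(v).hadd(v); // lane 0 = 1+2+3+4, lane 4 = 5+6+7+8
        let sum = h.low().extract(0) + h.high().extract(0);
        assert_eq!(sum, 36.0);
    }
}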
pub trait AvxBool32fx8 {
fn move_mask(self) -> u32;
}
impl AvxBool32fx8 for bool32fx8 {
#[inline]
fn move_mask(self) -> u32 {
unsafe { x86_mm256_movemask_ps(bitcast(self)) as u32 }
}
}
pub trait AvxBool32fx4 {}
impl AvxBool32fx4 for bool32fx4 {}
// 64 bit floats
pub trait AvxF64x2 {
fn permutevar(self, other: i64x2) -> f64x2;
}
impl AvxF64x2 for f64x2 {
fn permutevar(self, other: i64x2) -> f64x2 {
unsafe { x86_mm_permutevar_pd(self, other) }
}
}
pub trait AvxBool64fx2 {}
impl AvxBool64fx2 for bool64fx2 {}
// 64 bit integers
pub trait AvxU64x2 {}
impl AvxU64x2 for u64x2 {}
pub trait AvxI64x2 {}
impl AvxI64x2 for i64x2 {}
pub trait AvxBool64ix2 {}
impl AvxBool64ix2 for bool64ix2 {}
// 32 bit integers
pub trait AvxU32x4 {}
impl AvxU32x4 for u32x4 {}
pub trait AvxI32x4 {}
impl AvxI32x4 for i32x4 {}
pub trait AvxBool32ix4 {}
impl AvxBool32ix4 for bool32ix4 {}
// 16 bit integers
pub trait AvxU16x8 {}
impl AvxU16x8 for u16x8 {}
pub trait AvxI16x8 {}
impl AvxI16x8 for i16x8 {}
pub trait AvxBool16ix8 {}
impl AvxBool16ix8 for bool16ix8 {}
// 8 bit integers
pub trait AvxU8x16 {}
impl AvxU8x16 for u8x16 {}
pub trait AvxI8x16 {}
impl AvxI8x16 for i8x16 {}
pub trait AvxBool8ix16 {}
impl AvxBool8ix16 for bool8ix16 {}

third_party/rust/simd/src/x86/avx2.rs vendored Normal file
@@ -0,0 +1,65 @@
use x86::avx::*;
#[allow(dead_code)]
extern "platform-intrinsic" {
fn x86_mm256_abs_epi8(x: i8x32) -> i8x32;
fn x86_mm256_abs_epi16(x: i16x16) -> i16x16;
fn x86_mm256_abs_epi32(x: i32x8) -> i32x8;
fn x86_mm256_adds_epi8(x: i8x32, y: i8x32) -> i8x32;
fn x86_mm256_adds_epu8(x: u8x32, y: u8x32) -> u8x32;
fn x86_mm256_adds_epi16(x: i16x16, y: i16x16) -> i16x16;
fn x86_mm256_adds_epu16(x: u16x16, y: u16x16) -> u16x16;
fn x86_mm256_avg_epu8(x: u8x32, y: u8x32) -> u8x32;
fn x86_mm256_avg_epu16(x: u16x16, y: u16x16) -> u16x16;
fn x86_mm256_hadd_epi16(x: i16x16, y: i16x16) -> i16x16;
fn x86_mm256_hadd_epi32(x: i32x8, y: i32x8) -> i32x8;
fn x86_mm256_hadds_epi16(x: i16x16, y: i16x16) -> i16x16;
fn x86_mm256_hsub_epi16(x: i16x16, y: i16x16) -> i16x16;
fn x86_mm256_hsub_epi32(x: i32x8, y: i32x8) -> i32x8;
fn x86_mm256_hsubs_epi16(x: i16x16, y: i16x16) -> i16x16;
fn x86_mm256_madd_epi16(x: i16x16, y: i16x16) -> i32x8;
fn x86_mm256_maddubs_epi16(x: i8x32, y: i8x32) -> i16x16;
fn x86_mm256_max_epi8(x: i8x32, y: i8x32) -> i8x32;
fn x86_mm256_max_epu8(x: u8x32, y: u8x32) -> u8x32;
fn x86_mm256_max_epi16(x: i16x16, y: i16x16) -> i16x16;
fn x86_mm256_max_epu16(x: u16x16, y: u16x16) -> u16x16;
fn x86_mm256_max_epi32(x: i32x8, y: i32x8) -> i32x8;
fn x86_mm256_max_epu32(x: u32x8, y: u32x8) -> u32x8;
fn x86_mm256_min_epi8(x: i8x32, y: i8x32) -> i8x32;
fn x86_mm256_min_epu8(x: u8x32, y: u8x32) -> u8x32;
fn x86_mm256_min_epi16(x: i16x16, y: i16x16) -> i16x16;
fn x86_mm256_min_epu16(x: u16x16, y: u16x16) -> u16x16;
fn x86_mm256_min_epi32(x: i32x8, y: i32x8) -> i32x8;
fn x86_mm256_min_epu32(x: u32x8, y: u32x8) -> u32x8;
fn x86_mm256_mul_epi64(x: i32x8, y: i32x8) -> i64x4;
fn x86_mm256_mul_epu64(x: u32x8, y: u32x8) -> u64x4;
fn x86_mm256_mulhi_epi16(x: i16x16, y: i16x16) -> i16x16;
fn x86_mm256_mulhi_epu16(x: u16x16, y: u16x16) -> u16x16;
fn x86_mm256_mulhrs_epi16(x: i16x16, y: i16x16) -> i16x16;
fn x86_mm256_packs_epi16(x: i16x16, y: i16x16) -> i8x32;
fn x86_mm256_packus_epi16(x: i16x16, y: i16x16) -> u8x32;
fn x86_mm256_packs_epi32(x: i32x8, y: i32x8) -> i16x16;
fn x86_mm256_packus_epi32(x: i32x8, y: i32x8) -> u16x16;
fn x86_mm256_permutevar8x32_epi32(x: i32x8, y: i32x8) -> i32x8;
fn x86_mm256_permutevar8x32_ps(x: f32x8, y: i32x8) -> f32x8;
fn x86_mm256_sad_epu8(x: u8x32, y: u8x32) -> u8x32;
fn x86_mm256_shuffle_epi8(x: i8x32, y: i8x32) -> i8x32;
fn x86_mm256_sign_epi8(x: i8x32, y: i8x32) -> i8x32;
fn x86_mm256_sign_epi16(x: i16x16, y: i16x16) -> i16x16;
fn x86_mm256_sign_epi32(x: i32x8, y: i32x8) -> i32x8;
fn x86_mm256_subs_epi8(x: i8x32, y: i8x32) -> i8x32;
fn x86_mm256_subs_epu8(x: u8x32, y: u8x32) -> u8x32;
fn x86_mm256_subs_epi16(x: i16x16, y: i16x16) -> i16x16;
fn x86_mm256_subs_epu16(x: u16x16, y: u16x16) -> u16x16;
}
// broken on rustc 1.7.0-nightly (1ddaf8bdf 2015-12-12)
// pub trait Avx2F32x8 {
// fn permutevar(self, other: i32x8) -> f32x8;
// }
//
// impl Avx2F32x8 for f32x8 {
// fn permutevar(self, other: i32x8) -> f32x8 {
// unsafe { x86_mm256_permutevar8x32_ps(self, other) }
// }
// }

third_party/rust/simd/src/x86/mod.rs vendored Normal file
@@ -0,0 +1,16 @@
//! Features specific to x86 and x86-64 CPUs.
#[cfg(any(feature = "doc", target_feature = "sse2"))]
pub mod sse2;
#[cfg(any(feature = "doc", target_feature = "sse3"))]
pub mod sse3;
#[cfg(any(feature = "doc", target_feature = "ssse3"))]
pub mod ssse3;
#[cfg(any(feature = "doc", target_feature = "sse4.1"))]
pub mod sse4_1;
#[cfg(any(feature = "doc", target_feature = "sse4.2"))]
pub mod sse4_2;
#[cfg(any(feature = "doc", target_feature = "avx"))]
pub mod avx;
#[cfg(any(feature = "doc", target_feature = "avx2"))]
pub mod avx2;

third_party/rust/simd/src/x86/sse2.rs vendored Normal file
@@ -0,0 +1,359 @@
use super::super::*;
use {bitcast, simd_cast, f32x2};
pub use sixty_four::{f64x2, i64x2, u64x2, bool64ix2, bool64fx2};
//pub use super::{u64x2, i64x2, f64x2, bool64ix2, bool64fx2};
// strictly speaking, these are SSE instructions, not SSE2.
extern "platform-intrinsic" {
fn x86_mm_movemask_ps(x: f32x4) -> i32;
fn x86_mm_max_ps(x: f32x4, y: f32x4) -> f32x4;
fn x86_mm_min_ps(x: f32x4, y: f32x4) -> f32x4;
fn x86_mm_rsqrt_ps(x: f32x4) -> f32x4;
fn x86_mm_rcp_ps(x: f32x4) -> f32x4;
fn x86_mm_sqrt_ps(x: f32x4) -> f32x4;
}
extern "platform-intrinsic" {
fn x86_mm_adds_epi8(x: i8x16, y: i8x16) -> i8x16;
fn x86_mm_adds_epu8(x: u8x16, y: u8x16) -> u8x16;
fn x86_mm_adds_epi16(x: i16x8, y: i16x8) -> i16x8;
fn x86_mm_adds_epu16(x: u16x8, y: u16x8) -> u16x8;
fn x86_mm_avg_epu8(x: u8x16, y: u8x16) -> u8x16;
fn x86_mm_avg_epu16(x: u16x8, y: u16x8) -> u16x8;
fn x86_mm_madd_epi16(x: i16x8, y: i16x8) -> i32x4;
fn x86_mm_max_epi16(x: i16x8, y: i16x8) -> i16x8;
fn x86_mm_max_epu8(x: u8x16, y: u8x16) -> u8x16;
fn x86_mm_max_pd(x: f64x2, y: f64x2) -> f64x2;
fn x86_mm_min_epi16(x: i16x8, y: i16x8) -> i16x8;
fn x86_mm_min_epu8(x: u8x16, y: u8x16) -> u8x16;
fn x86_mm_min_pd(x: f64x2, y: f64x2) -> f64x2;
fn x86_mm_movemask_pd(x: f64x2) -> i32;
fn x86_mm_movemask_epi8(x: i8x16) -> i32;
fn x86_mm_mul_epu32(x: u32x4, y: u32x4) -> u64x2;
fn x86_mm_mulhi_epi16(x: i16x8, y: i16x8) -> i16x8;
fn x86_mm_mulhi_epu16(x: u16x8, y: u16x8) -> u16x8;
fn x86_mm_packs_epi16(x: i16x8, y: i16x8) -> i8x16;
fn x86_mm_packs_epi32(x: i32x4, y: i32x4) -> i16x8;
fn x86_mm_packus_epi16(x: i16x8, y: i16x8) -> u8x16;
fn x86_mm_sad_epu8(x: u8x16, y: u8x16) -> u64x2;
fn x86_mm_sqrt_pd(x: f64x2) -> f64x2;
fn x86_mm_subs_epi8(x: i8x16, y: i8x16) -> i8x16;
fn x86_mm_subs_epu8(x: u8x16, y: u8x16) -> u8x16;
fn x86_mm_subs_epi16(x: i16x8, y: i16x8) -> i16x8;
fn x86_mm_subs_epu16(x: u16x8, y: u16x8) -> u16x8;
}
#[doc(hidden)]
pub mod common {
use super::super::super::*;
use std::mem;
#[inline]
pub fn f32x4_sqrt(x: f32x4) -> f32x4 {
unsafe {super::x86_mm_sqrt_ps(x)}
}
#[inline]
pub fn f32x4_approx_rsqrt(x: f32x4) -> f32x4 {
unsafe {super::x86_mm_rsqrt_ps(x)}
}
#[inline]
pub fn f32x4_approx_reciprocal(x: f32x4) -> f32x4 {
unsafe {super::x86_mm_rcp_ps(x)}
}
#[inline]
pub fn f32x4_max(x: f32x4, y: f32x4) -> f32x4 {
unsafe {super::x86_mm_max_ps(x, y)}
}
#[inline]
pub fn f32x4_min(x: f32x4, y: f32x4) -> f32x4 {
unsafe {super::x86_mm_min_ps(x, y)}
}
macro_rules! bools {
($($ty: ty, $all: ident, $any: ident, $movemask: ident, $width: expr;)*) => {
$(
#[inline]
pub fn $all(x: $ty) -> bool {
unsafe {
super::$movemask(mem::transmute(x)) == (1 << $width) - 1
}
}
#[inline]
pub fn $any(x: $ty) -> bool {
unsafe {
super::$movemask(mem::transmute(x)) != 0
}
}
)*
}
}
bools! {
bool32fx4, bool32fx4_all, bool32fx4_any, x86_mm_movemask_ps, 4;
bool8ix16, bool8ix16_all, bool8ix16_any, x86_mm_movemask_epi8, 16;
bool16ix8, bool16ix8_all, bool16ix8_any, x86_mm_movemask_epi8, 16;
bool32ix4, bool32ix4_all, bool32ix4_any, x86_mm_movemask_epi8, 16;
}
}
// 32 bit floats
pub trait Sse2F32x4 {
fn to_f64(self) -> f64x2;
fn move_mask(self) -> u32;
}
impl Sse2F32x4 for f32x4 {
#[inline]
fn to_f64(self) -> f64x2 {
unsafe {
simd_cast(f32x2(self.0, self.1))
}
}
fn move_mask(self) -> u32 {
unsafe {x86_mm_movemask_ps(self) as u32}
}
}
pub trait Sse2Bool32fx4 {
fn move_mask(self) -> u32;
}
impl Sse2Bool32fx4 for bool32fx4 {
#[inline]
fn move_mask(self) -> u32 {
unsafe { x86_mm_movemask_ps(bitcast(self)) as u32}
}
}
// 64 bit floats
pub trait Sse2F64x2 {
fn move_mask(self) -> u32;
fn sqrt(self) -> Self;
fn max(self, other: Self) -> Self;
fn min(self, other: Self) -> Self;
}
impl Sse2F64x2 for f64x2 {
#[inline]
fn move_mask(self) -> u32 {
unsafe { x86_mm_movemask_pd(bitcast(self)) as u32}
}
#[inline]
fn sqrt(self) -> Self {
unsafe { x86_mm_sqrt_pd(self) }
}
#[inline]
fn max(self, other: Self) -> Self {
unsafe { x86_mm_max_pd(self, other) }
}
#[inline]
fn min(self, other: Self) -> Self {
unsafe { x86_mm_min_pd(self, other) }
}
}
pub trait Sse2Bool64fx2 {
fn move_mask(self) -> u32;
}
impl Sse2Bool64fx2 for bool64fx2 {
#[inline]
fn move_mask(self) -> u32 {
unsafe { x86_mm_movemask_pd(bitcast(self)) as u32}
}
}
// 64 bit ints
pub trait Sse2U64x2 {}
impl Sse2U64x2 for u64x2 {}
pub trait Sse2I64x2 {}
impl Sse2I64x2 for i64x2 {}
pub trait Sse2Bool64ix2 {}
impl Sse2Bool64ix2 for bool64ix2 {}
// 32 bit ints
pub trait Sse2U32x4 {
fn low_mul(self, other: Self) -> u64x2;
}
impl Sse2U32x4 for u32x4 {
#[inline]
fn low_mul(self, other: Self) -> u64x2 {
unsafe { x86_mm_mul_epu32(self, other) }
}
}
pub trait Sse2I32x4 {
fn packs(self, other: Self) -> i16x8;
}
impl Sse2I32x4 for i32x4 {
#[inline]
fn packs(self, other: Self) -> i16x8 {
unsafe { x86_mm_packs_epi32(self, other) }
}
}
pub trait Sse2Bool32ix4 {}
impl Sse2Bool32ix4 for bool32ix4 {}
// 16 bit ints
pub trait Sse2U16x8 {
fn adds(self, other: Self) -> Self;
fn subs(self, other: Self) -> Self;
fn avg(self, other: Self) -> Self;
fn mulhi(self, other: Self) -> Self;
}
impl Sse2U16x8 for u16x8 {
#[inline]
fn adds(self, other: Self) -> Self {
unsafe { x86_mm_adds_epu16(self, other) }
}
#[inline]
fn subs(self, other: Self) -> Self {
unsafe { x86_mm_subs_epu16(self, other) }
}
#[inline]
fn avg(self, other: Self) -> Self {
unsafe { x86_mm_avg_epu16(self, other) }
}
#[inline]
fn mulhi(self, other: Self) -> Self {
unsafe { x86_mm_mulhi_epu16(self, other) }
}
}
pub trait Sse2I16x8 {
fn adds(self, other: Self) -> Self;
fn subs(self, other: Self) -> Self;
fn madd(self, other: Self) -> i32x4;
fn max(self, other: Self) -> Self;
fn min(self, other: Self) -> Self;
fn mulhi(self, other: Self) -> Self;
fn packs(self, other: Self) -> i8x16;
fn packus(self, other: Self) -> u8x16;
}
impl Sse2I16x8 for i16x8 {
#[inline]
fn adds(self, other: Self) -> Self {
unsafe { x86_mm_adds_epi16(self, other) }
}
#[inline]
fn subs(self, other: Self) -> Self {
unsafe { x86_mm_subs_epi16(self, other) }
}
#[inline]
fn madd(self, other: Self) -> i32x4 {
unsafe { x86_mm_madd_epi16(self, other) }
}
#[inline]
fn max(self, other: Self) -> Self {
unsafe { x86_mm_max_epi16(self, other) }
}
#[inline]
fn min(self, other: Self) -> Self {
unsafe { x86_mm_min_epi16(self, other) }
}
#[inline]
fn mulhi(self, other: Self) -> Self {
unsafe { x86_mm_mulhi_epi16(self, other) }
}
#[inline]
fn packs(self, other: Self) -> i8x16 {
unsafe { x86_mm_packs_epi16(self, other) }
}
#[inline]
fn packus(self, other: Self) -> u8x16 {
unsafe { x86_mm_packus_epi16(self, other) }
}
}
pub trait Sse2Bool16ix8 {}
impl Sse2Bool16ix8 for bool16ix8 {}
// 8 bit ints
pub trait Sse2U8x16 {
fn move_mask(self) -> u32;
fn adds(self, other: Self) -> Self;
fn subs(self, other: Self) -> Self;
fn avg(self, other: Self) -> Self;
fn max(self, other: Self) -> Self;
fn min(self, other: Self) -> Self;
fn sad(self, other: Self) -> u64x2;
}
impl Sse2U8x16 for u8x16 {
#[inline]
fn move_mask(self) -> u32 {
unsafe { x86_mm_movemask_epi8(bitcast(self)) as u32}
}
#[inline]
fn adds(self, other: Self) -> Self {
unsafe { x86_mm_adds_epu8(self, other) }
}
#[inline]
fn subs(self, other: Self) -> Self {
unsafe { x86_mm_subs_epu8(self, other) }
}
#[inline]
fn avg(self, other: Self) -> Self {
unsafe { x86_mm_avg_epu8(self, other) }
}
#[inline]
fn max(self, other: Self) -> Self {
unsafe { x86_mm_max_epu8(self, other) }
}
#[inline]
fn min(self, other: Self) -> Self {
unsafe { x86_mm_min_epu8(self, other) }
}
#[inline]
fn sad(self, other: Self) -> u64x2 {
unsafe { x86_mm_sad_epu8(self, other) }
}
}
pub trait Sse2I8x16 {
fn move_mask(self) -> u32;
fn adds(self, other: Self) -> Self;
fn subs(self, other: Self) -> Self;
}
impl Sse2I8x16 for i8x16 {
#[inline]
fn move_mask(self) -> u32 {
unsafe { x86_mm_movemask_epi8(bitcast(self)) as u32}
}
#[inline]
fn adds(self, other: Self) -> Self {
unsafe { x86_mm_adds_epi8(self, other) }
}
#[inline]
fn subs(self, other: Self) -> Self {
unsafe { x86_mm_subs_epi8(self, other) }
}
}
pub trait Sse2Bool8ix16 {
fn move_mask(self) -> u32;
}
impl Sse2Bool8ix16 for bool8ix16 {
#[inline]
fn move_mask(self) -> u32 {
unsafe { x86_mm_movemask_epi8(bitcast(self)) as u32}
}
}
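A minimal usage sketch of the SSE2 wrappers above (an illustration, not part of this patch), assuming the vendored simd 0.2 crate on an x86_64 nightly compiler, or a stable one with RUSTC_BOOTSTRAP=1 as this commit's build glue arranges:

extern crate simd;

use simd::u8x16;
use simd::x86::sse2::Sse2U8x16;

fn main() {
    let a = u8x16::splat(200);
    let b = u8x16::splat(100);
    // adds() is PADDUSB: 200 + 100 saturates to 255 in every lane
    // instead of wrapping.
    let sum = a.adds(b);
    assert_eq!(sum.extract(0), 255);
    // move_mask() is PMOVMSKB: one bit per lane, taken from the lane's
    // high bit; all 16 bits are set here because every lane is >= 0x80.
    assert_eq!(sum.move_mask(), 0xffff);
}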

third_party/rust/simd/src/x86/sse3.rs vendored Normal file
@@ -0,0 +1,57 @@
use sixty_four::*;
use super::super::*;
extern "platform-intrinsic" {
fn x86_mm_addsub_ps(x: f32x4, y: f32x4) -> f32x4;
fn x86_mm_addsub_pd(x: f64x2, y: f64x2) -> f64x2;
fn x86_mm_hadd_ps(x: f32x4, y: f32x4) -> f32x4;
fn x86_mm_hadd_pd(x: f64x2, y: f64x2) -> f64x2;
fn x86_mm_hsub_ps(x: f32x4, y: f32x4) -> f32x4;
fn x86_mm_hsub_pd(x: f64x2, y: f64x2) -> f64x2;
}
pub trait Sse3F32x4 {
fn addsub(self, other: Self) -> Self;
fn hadd(self, other: Self) -> Self;
fn hsub(self, other: Self) -> Self;
}
impl Sse3F32x4 for f32x4 {
#[inline]
fn addsub(self, other: Self) -> Self {
unsafe { x86_mm_addsub_ps(self, other) }
}
#[inline]
fn hadd(self, other: Self) -> Self {
unsafe { x86_mm_hadd_ps(self, other) }
}
#[inline]
fn hsub(self, other: Self) -> Self {
unsafe { x86_mm_hsub_ps(self, other) }
}
}
pub trait Sse3F64x2 {
fn addsub(self, other: Self) -> Self;
fn hadd(self, other: Self) -> Self;
fn hsub(self, other: Self) -> Self;
}
impl Sse3F64x2 for f64x2 {
#[inline]
fn addsub(self, other: Self) -> Self {
unsafe { x86_mm_addsub_pd(self, other) }
}
#[inline]
fn hadd(self, other: Self) -> Self {
unsafe { x86_mm_hadd_pd(self, other) }
}
#[inline]
fn hsub(self, other: Self) -> Self {
unsafe { x86_mm_hsub_pd(self, other) }
}
}
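As a sketch of what these horizontal operations buy (illustration only, same nightly/simd-crate assumptions as the sketch above): two rounds of hadd reduce an f32x4 to its horizontal sum in every lane.

extern crate simd;

use simd::f32x4;
use simd::x86::sse3::Sse3F32x4;

fn main() {
    let v = f32x4::new(1.0, 2.0, 3.0, 4.0);
    // HADDPS(v, v) = (1+2, 3+4, 1+2, 3+4) = (3, 7, 3, 7)
    let h = v.hadd(v);
    // HADDPS(h, h) = (3+7, 3+7, 3+7, 3+7) = (10, 10, 10, 10)
    assert_eq!(h.hadd(h).extract(0), 10.0);
}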

third_party/rust/simd/src/x86/sse4_1.rs vendored Normal file
@@ -0,0 +1,155 @@
use super::super::*;
use x86::sse2::*;
#[allow(dead_code)]
extern "platform-intrinsic" {
fn x86_mm_dp_ps(x: f32x4, y: f32x4, z: i32) -> f32x4;
fn x86_mm_dp_pd(x: f64x2, y: f64x2, z: i32) -> f64x2;
fn x86_mm_max_epi8(x: i8x16, y: i8x16) -> i8x16;
fn x86_mm_max_epu16(x: u16x8, y: u16x8) -> u16x8;
fn x86_mm_max_epi32(x: i32x4, y: i32x4) -> i32x4;
fn x86_mm_max_epu32(x: u32x4, y: u32x4) -> u32x4;
fn x86_mm_min_epi8(x: i8x16, y: i8x16) -> i8x16;
fn x86_mm_min_epu16(x: u16x8, y: u16x8) -> u16x8;
fn x86_mm_min_epi32(x: i32x4, y: i32x4) -> i32x4;
fn x86_mm_min_epu32(x: u32x4, y: u32x4) -> u32x4;
fn x86_mm_minpos_epu16(x: u16x8) -> u16x8;
fn x86_mm_mpsadbw_epu8(x: u8x16, y: u8x16, z: i32) -> u16x8;
fn x86_mm_mul_epi32(x: i32x4, y: i32x4) -> i64x2;
fn x86_mm_packus_epi32(x: i32x4, y: i32x4) -> u16x8;
fn x86_mm_testc_si128(x: u64x2, y: u64x2) -> i32;
fn x86_mm_testnzc_si128(x: u64x2, y: u64x2) -> i32;
fn x86_mm_testz_si128(x: u64x2, y: u64x2) -> i32;
}
// 32 bit floats
pub trait Sse41F32x4 {}
impl Sse41F32x4 for f32x4 {}
// 64 bit floats
pub trait Sse41F64x2 {}
impl Sse41F64x2 for f64x2 {}
// 64 bit integers
pub trait Sse41U64x2 {
fn testc(self, other: Self) -> i32;
fn testnzc(self, other: Self) -> i32;
fn testz(self, other: Self) -> i32;
}
impl Sse41U64x2 for u64x2 {
#[inline]
fn testc(self, other: Self) -> i32 {
unsafe { x86_mm_testc_si128(self, other) }
}
#[inline]
fn testnzc(self, other: Self) -> i32 {
unsafe { x86_mm_testnzc_si128(self, other) }
}
#[inline]
fn testz(self, other: Self) -> i32 {
unsafe { x86_mm_testz_si128(self, other) }
}
}
pub trait Sse41I64x2 {}
impl Sse41I64x2 for i64x2 {}
pub trait Sse41Bool64ix2 {}
impl Sse41Bool64ix2 for bool64ix2 {}
// 32 bit integers
pub trait Sse41U32x4 {
fn max(self, other: Self) -> Self;
fn min(self, other: Self) -> Self;
}
impl Sse41U32x4 for u32x4 {
#[inline]
fn max(self, other: Self) -> Self {
unsafe { x86_mm_max_epu32(self, other) }
}
#[inline]
fn min(self, other: Self) -> Self {
unsafe { x86_mm_min_epu32(self, other) }
}
}
pub trait Sse41I32x4 {
fn max(self, other: Self) -> Self;
fn min(self, other: Self) -> Self;
fn low_mul(self, other: Self) -> i64x2;
fn packus(self, other: Self) -> u16x8;
}
impl Sse41I32x4 for i32x4 {
#[inline]
fn max(self, other: Self) -> Self {
unsafe { x86_mm_max_epi32(self, other) }
}
#[inline]
fn min(self, other: Self) -> Self {
unsafe { x86_mm_min_epi32(self, other) }
}
#[inline]
fn low_mul(self, other: Self) -> i64x2 {
unsafe { x86_mm_mul_epi32(self, other) }
}
#[inline]
fn packus(self, other: Self) -> u16x8 {
unsafe { x86_mm_packus_epi32(self, other) }
}
}
pub trait Sse41Bool32ix4 {}
impl Sse41Bool32ix4 for bool32ix4 {}
// 16 bit integers
pub trait Sse41U16x8 {
fn max(self, other: Self) -> Self;
fn min(self, other: Self) -> Self;
fn minpos(self) -> Self;
}
impl Sse41U16x8 for u16x8 {
#[inline]
fn max(self, other: Self) -> Self {
unsafe { x86_mm_max_epu16(self, other) }
}
#[inline]
fn min(self, other: Self) -> Self {
unsafe { x86_mm_min_epu16(self, other) }
}
#[inline]
fn minpos(self) -> Self {
unsafe { x86_mm_minpos_epu16(self) }
}
}
pub trait Sse41I16x8 {}
impl Sse41I16x8 for i16x8 {}
pub trait Sse41Bool16ix8 {}
impl Sse41Bool16ix8 for bool16ix8 {}
// 8 bit integers
pub trait Sse41U8x16 {}
impl Sse41U8x16 for u8x16 {}
pub trait Sse41I8x16 {
fn max(self, other: Self) -> Self;
fn min(self, other: Self) -> Self;
}
impl Sse41I8x16 for i8x16 {
#[inline]
fn max(self, other: Self) -> Self {
unsafe { x86_mm_max_epi8(self, other) }
}
#[inline]
fn min(self, other: Self) -> Self {
unsafe { x86_mm_min_epi8(self, other) }
}
}
pub trait Sse41Bool8ix16 {}
impl Sse41Bool8ix16 for bool8ix16 {}
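A sketch of minpos, the one SSE4.1 wrapper here without an obvious scalar analogue (illustration only, same assumptions as the earlier sketches): PHMINPOSUW finds the horizontal minimum and its lane index in a single instruction.

extern crate simd;

use simd::u16x8;
use simd::x86::sse4_1::Sse41U16x8;

fn main() {
    let v = u16x8::new(9, 3, 7, 5, 8, 2, 6, 4);
    // PHMINPOSUW: lane 0 gets the minimum, lane 1 its index,
    // the remaining lanes are zeroed.
    let r = v.minpos();
    assert_eq!(r.extract(0), 2); // minimum value
    assert_eq!(r.extract(1), 5); // found in lane 5
}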

third_party/rust/simd/src/x86/sse4_2.rs vendored Normal file
@@ -0,0 +1,19 @@
use i8x16;
#[allow(dead_code)]
extern "platform-intrinsic" {
fn x86_mm_cmpestra(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
fn x86_mm_cmpestrc(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
fn x86_mm_cmpestri(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
fn x86_mm_cmpestrm(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i8x16;
fn x86_mm_cmpestro(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
fn x86_mm_cmpestrs(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
fn x86_mm_cmpestrz(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
fn x86_mm_cmpistra(x: i8x16, y: i8x16, z: i32) -> i32;
fn x86_mm_cmpistrc(x: i8x16, y: i8x16, z: i32) -> i32;
fn x86_mm_cmpistri(x: i8x16, y: i8x16, z: i32) -> i32;
fn x86_mm_cmpistrm(x: i8x16, y: i8x16, z: i32) -> i8x16;
fn x86_mm_cmpistro(x: i8x16, y: i8x16, z: i32) -> i32;
fn x86_mm_cmpistrs(x: i8x16, y: i8x16, z: i32) -> i32;
fn x86_mm_cmpistrz(x: i8x16, y: i8x16, z: i32) -> i32;
}
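The PCMPxSTRx declarations above ship without safe trait wrappers (hence the #[allow(dead_code)]). A hypothetical sketch of the kind of wrapper a later patch could add inside this module — the trait name and method are invented here, not part of the vendored crate. Mode 0 is _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT, i.e. "index of the first byte of `haystack` that appears in `set`, or 16 if none":

// Hypothetical addition, not in the vendored crate:
pub trait Sse42I8x16 {
    fn cmpistri_any(self, haystack: Self) -> i32;
}

impl Sse42I8x16 for i8x16 {
    #[inline]
    fn cmpistri_any(self, haystack: Self) -> i32 {
        // PCMPISTRI with an immediate mode of 0; the immediate must be a
        // compile-time constant for codegen to succeed.
        unsafe { x86_mm_cmpistri(self, haystack, 0) }
    }
}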

third_party/rust/simd/src/x86/ssse3.rs vendored Normal file
@@ -0,0 +1,172 @@
use super::super::*;
use bitcast;
macro_rules! bitcast {
($func: ident($a: ident, $b: ident)) => {
bitcast($func(bitcast($a), bitcast($b)))
}
}
extern "platform-intrinsic" {
fn x86_mm_abs_epi8(x: i8x16) -> i8x16;
fn x86_mm_abs_epi16(x: i16x8) -> i16x8;
fn x86_mm_abs_epi32(x: i32x4) -> i32x4;
fn x86_mm_hadd_epi16(x: i16x8, y: i16x8) -> i16x8;
fn x86_mm_hadd_epi32(x: i32x4, y: i32x4) -> i32x4;
fn x86_mm_hadds_epi16(x: i16x8, y: i16x8) -> i16x8;
fn x86_mm_hsub_epi16(x: i16x8, y: i16x8) -> i16x8;
fn x86_mm_hsub_epi32(x: i32x4, y: i32x4) -> i32x4;
fn x86_mm_hsubs_epi16(x: i16x8, y: i16x8) -> i16x8;
fn x86_mm_maddubs_epi16(x: u8x16, y: i8x16) -> i16x8;
fn x86_mm_mulhrs_epi16(x: i16x8, y: i16x8) -> i16x8;
fn x86_mm_shuffle_epi8(x: i8x16, y: i8x16) -> i8x16;
fn x86_mm_sign_epi8(x: i8x16, y: i8x16) -> i8x16;
fn x86_mm_sign_epi16(x: i16x8, y: i16x8) -> i16x8;
fn x86_mm_sign_epi32(x: i32x4, y: i32x4) -> i32x4;
}
// 32 bit integers
pub trait Ssse3I32x4 {
fn abs(self) -> Self;
fn hadd(self, other: Self) -> Self;
fn hsub(self, other: Self) -> Self;
fn sign(self, other: Self) -> Self;
}
impl Ssse3I32x4 for i32x4 {
#[inline]
fn abs(self) -> Self {
unsafe { x86_mm_abs_epi32(self) }
}
#[inline]
fn hadd(self, other: Self) -> Self {
unsafe { x86_mm_hadd_epi32(self, other) }
}
#[inline]
fn hsub(self, other: Self) -> Self {
unsafe { x86_mm_hsub_epi32(self, other) }
}
#[inline]
fn sign(self, other: Self) -> Self {
unsafe { x86_mm_sign_epi32(self, other) }
}
}
pub trait Ssse3U32x4 {
fn hadd(self, other: Self) -> Self;
fn hsub(self, other: Self) -> Self;
}
impl Ssse3U32x4 for u32x4 {
#[inline]
fn hadd(self, other: Self) -> Self {
unsafe { bitcast!(x86_mm_hadd_epi32(self, other)) }
}
#[inline]
fn hsub(self, other: Self) -> Self {
unsafe { bitcast!(x86_mm_hsub_epi32(self, other)) }
}
}
// 16 bit integers
pub trait Ssse3I16x8 {
fn abs(self) -> Self;
fn hadd(self, other: Self) -> Self;
fn hadds(self, other: Self) -> Self;
fn hsub(self, other: Self) -> Self;
fn hsubs(self, other: Self) -> Self;
fn sign(self, other: Self) -> Self;
fn mulhrs(self, other: Self) -> Self;
}
impl Ssse3I16x8 for i16x8 {
#[inline]
fn abs(self) -> Self {
unsafe { x86_mm_abs_epi16(self) }
}
#[inline]
fn hadd(self, other: Self) -> Self {
unsafe { x86_mm_hadd_epi16(self, other) }
}
#[inline]
fn hadds(self, other: Self) -> Self {
unsafe { x86_mm_hadds_epi16(self, other) }
}
#[inline]
fn hsub(self, other: Self) -> Self {
unsafe { x86_mm_hsub_epi16(self, other) }
}
#[inline]
fn hsubs(self, other: Self) -> Self {
unsafe { x86_mm_hsubs_epi16(self, other) }
}
#[inline]
fn sign(self, other: Self) -> Self {
unsafe { x86_mm_sign_epi16(self, other) }
}
#[inline]
fn mulhrs(self, other: Self) -> Self {
unsafe { x86_mm_mulhrs_epi16(self, other) }
}
}
pub trait Ssse3U16x8 {
fn hadd(self, other: Self) -> Self;
fn hsub(self, other: Self) -> Self;
}
impl Ssse3U16x8 for u16x8 {
#[inline]
fn hadd(self, other: Self) -> Self {
unsafe { bitcast!(x86_mm_hadd_epi16(self, other)) }
}
#[inline]
fn hsub(self, other: Self) -> Self {
unsafe { bitcast!(x86_mm_hsub_epi16(self, other)) }
}
}
// 8 bit integers
pub trait Ssse3U8x16 {
fn shuffle_bytes(self, indices: Self) -> Self;
fn maddubs(self, other: i8x16) -> i16x8;
}
impl Ssse3U8x16 for u8x16 {
#[inline]
fn shuffle_bytes(self, indices: Self) -> Self {
unsafe {bitcast!(x86_mm_shuffle_epi8(self, indices))}
}
    #[inline]
    fn maddubs(self, other: i8x16) -> i16x8 {
unsafe { x86_mm_maddubs_epi16(self, other) }
}
}
pub trait Ssse3I8x16 {
fn abs(self) -> Self;
fn shuffle_bytes(self, indices: Self) -> Self;
fn sign(self, other: Self) -> Self;
}
impl Ssse3I8x16 for i8x16 {
#[inline]
fn abs(self) -> Self {
unsafe {x86_mm_abs_epi8(self)}
}
#[inline]
fn shuffle_bytes(self, indices: Self) -> Self {
unsafe {
x86_mm_shuffle_epi8(self, indices)
}
}
#[inline]
fn sign(self, other: Self) -> Self {
unsafe { x86_mm_sign_epi8(self, other) }
}
}
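A sketch of shuffle_bytes (illustration only, same assumptions as the earlier sketches): PSHUFB is the workhorse behind in-register table lookups and byte permutations, here reversing a vector.

extern crate simd;

use simd::u8x16;
use simd::x86::ssse3::Ssse3U8x16;

fn main() {
    let v = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let rev = u8x16::new(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    // PSHUFB: output lane i takes the input lane named by indices[i];
    // an index with its high bit set would zero that lane instead.
    let r = v.shuffle_bytes(rev);
    assert_eq!(r.extract(0), 15);
    assert_eq!(r.extract(15), 0);
}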

toolkit/library/gtest/rust/Cargo.lock generated
@@ -301,6 +301,7 @@ version = "0.6.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -852,6 +853,11 @@ dependencies = [
"nodrop 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "simd"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "siphasher"
version = "0.2.1"
@@ -1295,6 +1301,7 @@ dependencies = [
"checksum serde 0.9.9 (registry+https://github.com/rust-lang/crates.io-index)" = "05a67b8a53f885f4b6e3ed183806035819f9862474e747fe4488a6d63bcbfcb7"
"checksum serde_codegen_internals 0.14.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4d52006899f910528a10631e5b727973fe668f3228109d1707ccf5bad5490b6e"
"checksum serde_derive 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)" = "f15ea24bd037b2d64646b4d934fa99c649be66e3f7b29fb595a5543b212b1452"
"checksum simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a94d14a2ae1f1f110937de5fb69e494372560181c7e1739a097fcc2cee37ba0"
"checksum siphasher 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2ffc669b726f2bc9a3bcff66e5e23b56ba6bf70e22a34c3d7b6d0b3450b65b84"
"checksum smallvec 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "4f8266519bc1d17d0b5b16f6c21295625d562841c708f6376f49028a43e9c11e"
"checksum smallvec 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2e40af10aafe98b4d8294ae8388d8a5cd0707c65d364872efe72d063ec44bee0"

@@ -11,7 +11,7 @@ servo = ["gkrust-shared/servo"]
quantum_render = ["gkrust-shared/quantum_render"]
cubeb_pulse_rust = ["gkrust-shared/cubeb_pulse_rust"]
gecko_debug = ["gkrust-shared/gecko_debug"]
# simd-accel = ["gkrust-shared/simd-accel"]
simd-accel = ["gkrust-shared/simd-accel"]
no-static-ideograph-encoder-tables = ["gkrust-shared/no-static-ideograph-encoder-tables"]
# parallel-utf8 = ["gkrust-shared/parallel-utf8"]

toolkit/library/rust/Cargo.lock generated
@@ -299,6 +299,7 @@ version = "0.6.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -839,6 +840,11 @@ dependencies = [
"nodrop 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "simd"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "siphasher"
version = "0.2.1"
@@ -1282,6 +1288,7 @@ dependencies = [
"checksum serde 0.9.9 (registry+https://github.com/rust-lang/crates.io-index)" = "05a67b8a53f885f4b6e3ed183806035819f9862474e747fe4488a6d63bcbfcb7"
"checksum serde_codegen_internals 0.14.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4d52006899f910528a10631e5b727973fe668f3228109d1707ccf5bad5490b6e"
"checksum serde_derive 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)" = "f15ea24bd037b2d64646b4d934fa99c649be66e3f7b29fb595a5543b212b1452"
"checksum simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a94d14a2ae1f1f110937de5fb69e494372560181c7e1739a097fcc2cee37ba0"
"checksum siphasher 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2ffc669b726f2bc9a3bcff66e5e23b56ba6bf70e22a34c3d7b6d0b3450b65b84"
"checksum smallvec 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "4f8266519bc1d17d0b5b16f6c21295625d562841c708f6376f49028a43e9c11e"
"checksum smallvec 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2e40af10aafe98b4d8294ae8388d8a5cd0707c65d364872efe72d063ec44bee0"

@@ -11,7 +11,7 @@ servo = ["gkrust-shared/servo"]
quantum_render = ["gkrust-shared/quantum_render"]
cubeb_pulse_rust = ["gkrust-shared/cubeb_pulse_rust"]
gecko_debug = ["gkrust-shared/gecko_debug"]
# simd-accel = ["gkrust-shared/simd-accel"]
simd-accel = ["gkrust-shared/simd-accel"]
no-static-ideograph-encoder-tables = ["gkrust-shared/no-static-ideograph-encoder-tables"]
# parallel-utf8 = ["gkrust-shared/parallel-utf8"]

@@ -20,4 +20,7 @@ if CONFIG['MOZ_BUILD_WEBRENDER']:
if CONFIG['MOZ_PULSEAUDIO']:
    gkrust_features += ['cubeb_pulse_rust']

if CONFIG['MOZ_RUST_SIMD']:
    gkrust_features += ['simd-accel']
    gkrust_features += ['no-static-ideograph-encoder-tables']

@@ -795,6 +795,19 @@ set_config('MOZ_BUILD_WEBRENDER', webrender.build)
set_define('MOZ_BUILD_WEBRENDER', webrender.build)
set_config('MOZ_ENABLE_WEBRENDER', webrender.enable)
# SIMD acceleration for Rust code (currently just encoding_rs)
option('--enable-rust-simd', env='MOZ_RUST_SIMD',
       help='Enable explicit SIMD in Rust code.')

@depends('--enable-rust-simd')
def rust_simd(value):
    if value:
        return True

set_config('MOZ_RUST_SIMD', rust_simd)
set_define('MOZ_RUST_SIMD', rust_simd)
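The flag flows from here to the code: --enable-rust-simd sets MOZ_RUST_SIMD, which turns on gkrust's simd-accel cargo feature (and RUSTC_BOOTSTRAP=1 in the cargo environment), which gkrust-shared forwards to encoding_rs as a cfg. A hypothetical sketch of the kind of compile-time dispatch such a feature enables — not actual encoding_rs code; is_ascii is an invented helper:

#[cfg(feature = "simd-accel")]
extern crate simd;

#[cfg(feature = "simd-accel")]
fn is_ascii(bytes: &[u8]) -> bool {
    use simd::u8x16;
    use simd::x86::sse2::Sse2U8x16;
    let mut i = 0;
    // 16 bytes per iteration: PMOVMSKB yields a non-zero mask as soon as
    // any lane has its high (non-ASCII) bit set.
    while i + 16 <= bytes.len() {
        if u8x16::load(bytes, i).move_mask() != 0 {
            return false;
        }
        i += 16;
    }
    // Scalar tail for the last < 16 bytes.
    bytes[i..].iter().all(|&b| b < 0x80)
}

#[cfg(not(feature = "simd-accel"))]
fn is_ascii(bytes: &[u8]) -> bool {
    bytes.iter().all(|&b| b < 0x80)
}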
# Printing
# ==============================================================
@depends(target)