Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1261841 part 4 - Add a configuration option for enabling explicit SIMD in Rust. r=froydnj.
MozReview-Commit-ID: ICifcJ9499a
This commit is contained in:
Parent
d35d69cef8
Commit
763d66dd51
config/rules.mk
@@ -963,6 +963,11 @@ else
 environment_cleaner =
 endif
 
+rust_unlock_unstable =
+ifdef MOZ_RUST_SIMD
+rust_unlock_unstable += RUSTC_BOOTSTRAP=1
+endif
+
 # This function is intended to be called by:
 #
 # $(call CARGO_BUILD,EXTRA_ENV_VAR1=X EXTRA_ENV_VAR2=Y ...)
@@ -971,7 +976,7 @@ endif
 #
 # $(call CARGO_BUILD)
 define CARGO_BUILD
-env $(environment_cleaner) $(rustflags_override) \
+env $(environment_cleaner) $(rust_unlock_unstable) $(rustflags_override) \
 	CARGO_TARGET_DIR=$(CARGO_TARGET_DIR) \
 	RUSTC=$(RUSTC) \
 	MOZ_SRC=$(topsrcdir) \
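Note: the mechanism behind MOZ_RUST_SIMD is RUSTC_BOOTSTRAP=1, the environment variable that makes a stable rustc accept nightly-only feature gates, which the vendored `simd` crate below relies on. A minimal sketch of my own (not part of this commit) of the kind of code this unlocks:

// Hypothetical illustration: a stable rustc rejects this inner attribute
// unless RUSTC_BOOTSTRAP=1 is set in its environment, which is exactly
// what rust_unlock_unstable injects into the cargo invocation above.
#![feature(cfg_target_feature)]

fn main() {
    // At the time of this commit, cfg!(target_feature = "...") was still
    // gated behind the cfg_target_feature feature.
    println!("sse2 enabled: {}", cfg!(target_feature = "sse2"));
}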
.cargo-checksum.json
@@ -0,0 +1 @@
{"files":{".cargo-ok":"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",".gitignore":"695a11fe96751963cac851b3f556a8b922ae48ef73ee929cfe66164ea3db40cd",".travis.yml":"e2c720c3633b7671efce49147c62b12bcbf630d7c5d6fc65cd97620bfa4ddcea","Cargo.toml":"608aad04f17a524ee21048fa2ce9f656ae344e0473dd0e331dc954f0f9677c63","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6d3a9431e65e69c73a8923e6517b889d17549b23db406b9ec027710d16af701f","README.md":"249294a9a5f63c64c0f7fe4a607060f43f3507dce2378067aa59d25fb3ae681d","benches/mandelbrot.rs":"051b5199e66bca6cf7774e9024915fd4e1349ab39726a10a14e06b60d65d87a4","benches/matrix.rs":"048a21dacdb62365e0105d00d2c8cd6bd2396ac81134f2bff7eb4f7d095fb735","examples/axpy.rs":"4307626045d64ec08361c97c9c72c5dc8d361bdc88f64453b97ac0212041a1b2","examples/convert.rs":"8e658fde050f8a0d8b84ad7570446b10fcf544afbd551b940ca340474f324840","examples/dot-product.rs":"6fe2e007c147af5353804173a593c5b9d57dbccec156e1da37e9e32537363f91","examples/fannkuch-redux-nosimd.rs":"7b2fbde35e8666929d14d67328471cb0483d038a5325232f8db148b30865312b","examples/fannkuch-redux.rs":"ea21fdbd2274488a62cc984acad6e0b65d52f24fb4ff63b7057a3a667e9c8aae","examples/mandelbrot.rs":"8b8fdca1edac50e5a33e0e0592bd41eb75114f31839ccd40d485c61a9a664380","examples/matrix-inverse.rs":"a378d20ef20c2119bb10a86de27c92fec2c2f77f374e6bfd36707c9825a5fe92","examples/nbody-nosimd.rs":"2c8e0a7feacd202fdd65eeceb6420d6e9f43340b81f20a8e532704a587a2796b","examples/nbody.rs":"a864311affab262024479d6348ff51af43d809e9ad332ec30ea4aacceaa2eae1","examples/ops.rs":"1316f915d0afcfa98fdc4077e965ccccf6b4b21c433cbe487ff0cdc60df3cd39","examples/spectral-norm-nosimd.rs":"ffc8512ecde779078ea467f38f423a0ea623c63da7078193f9dd370200773f79","examples/spectral-norm.rs":"edb09c9d477f83939098cfb77a27cc298bc7a0c8a8e29cece0cccae0d70d890e","src/aarch64/mod.rs":"83f52775364c98de0cecb7e1509530c18972e932469f5f1522aa24a735d0fa37","src/aarch64/neon.rs":"1fe769979e07d8e2bc3c78ce116e05d735860744efe097a894cc9421153257fb","src/arm/mod.rs":"dcdd90bc0b39abaf86a0c8946d442b16313563fbae1ff03248628275c74d8617","src/arm/neon.rs":"51cc509856200e80f8e4cc2c982586e6d1cef593ec4537e153dce0cfe31d3428","src/common.rs":"62f4e7e0fefb52ad190d0f2191bc435ac4deab3f2bc70dc427f2a7f9ccb7856e","src/lib.rs":"25f0b39c038fa85af858318135dfd87865be26c33bb4bd1438aec96a1e68d8b5","src/sixty_four.rs":"510a9e00189a61e4f0a5beb7052d5dee37fc8261f94a2af45ef10327e0f3b7df","src/v256.rs":"2e328e49034876d535e0627c7a62191da2b4fb156a657614bf531a5fc75b1385","src/x86/avx.rs":"c66140abefca634b48eae307c3ec8cf5a40f2279b10e246a7e2ac602a2a2bb28","src/x86/avx2.rs":"efe3006b13a13261a3dec3d37dc1d8cb53950f3803c420069231803374949937","src/x86/mod.rs":"0acc5a5e2672e2a0fddc11065663be8b8fa2da87320ea291fa86ff8c2f33edf5","src/x86/sse2.rs":"5ceda75a401958a135fc9d851b22075314cdeed69fd483b6a7be4f11373f40da","src/x86/sse3.rs":"9bd01a4f08069ca4f445952e744d651efe887e3835b18872e757375f0d053bd2","src/x86/sse4_1.rs":"9ceb80dd70a7e7dfeef508cb935e1a2637175bc87a3b090f5dea691ff6aa0516","src/x86/sse4_2.rs":"c59321aed8decdce4d0d8570cff46aed02e1a8265647ef7702e9b180fc581254","src/x86/ssse3.rs":"2290f0269bae316b8e0491495645ee38a9bd73525c8572759c1328341c3bdb4c"},"package":"7a94d14a2ae1f1f110937de5fb69e494372560181c7e1739a097fcc2cee37ba0"}
.gitignore
@@ -0,0 +1,3 @@
perf.data*
/target
Cargo.lock
.travis.yml
@@ -0,0 +1,24 @@
language: rust
sudo: false

rust:
  - nightly

# load travis-cargo
before_script:
  - |
      pip install 'travis-cargo<0.2' --user &&
      export PATH=$HOME/.local/bin:$PATH

# the main build
script:
  - |
      travis-cargo build &&
      travis-cargo test &&
      travis-cargo bench &&
      travis-cargo doc -- --features doc

env:
  global:
    # override the default `--features unstable` used for the nightly branch (optional)
    - TRAVIS_CARGO_NIGHTLY_FEATURE="with-serde"
Cargo.toml
@@ -0,0 +1,26 @@
[package]
name = "simd"
version = "0.2.0"
authors = ["Huon Wilson <dbau.pp+github@gmail.com>"]

repository = "https://github.com/rust-lang-nursery/simd"
documentation = "https://rust-lang-nursery.github.io/simd/doc/simd"
license = "MIT/Apache-2.0"
keywords = ["simd", "data-parallel"]
readme = "README.md"

description = """
`simd` offers limited cross-platform access to SIMD instructions on
CPUs, as well as raw interfaces to platform-specific instructions.
"""

[dependencies]
serde = { version = "0.8", optional = true }
serde_derive = { version = "0.8", optional = true }

[dev-dependencies]
cfg-if = "0.1"

[features]
doc = []
with-serde = ["serde", "serde_derive"]
LICENSE-APACHE
@@ -0,0 +1,201 @@
                              Apache License
                        Version 2.0, January 2004
                     http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
LICENSE-MIT
@@ -0,0 +1,25 @@
Copyright (c) 2014 Huon Wilson

Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
README.md
@@ -0,0 +1,7 @@
# `simd`

[![Build Status](https://travis-ci.org/rust-lang-nursery/simd.png)](https://travis-ci.org/rust-lang-nursery/simd)

`simd` offers a basic interface to the SIMD functionality of CPUs.

[Documentation](https://docs.rs/simd)
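Note: a short usage sketch of that basic interface (my own illustration, pieced together from operations the vendored examples below use — lane construction, splat, lane-wise arithmetic, and extract):

extern crate simd;
use simd::f32x4;

fn main() {
    // Four f32 lanes processed at once; + and * apply lane-wise.
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let b = f32x4::splat(10.0);
    let c = a * b + f32x4::splat(0.5);
    println!("{} {} {} {}",
             c.extract(0), c.extract(1), c.extract(2), c.extract(3));
}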
benches/mandelbrot.rs
@@ -0,0 +1,117 @@
#![feature(test)]
#![feature(cfg_target_feature)]

extern crate simd;
extern crate test;

use test::black_box as bb;
use test::Bencher as B;
use simd::{f32x4, u32x4};
#[cfg(any(target_feature = "avx", target_feature = "avx2"))]
use simd::x86::avx::{f32x8, u32x8};

fn naive(c_x: f32, c_y: f32, max_iter: u32) -> u32 {
    let mut x = c_x;
    let mut y = c_y;
    let mut count = 0;
    while count < max_iter {
        let xy = x * y;
        let xx = x * x;
        let yy = y * y;
        let sum = xx + yy;
        if sum > 4.0 {
            break
        }
        count += 1;
        x = xx - yy + c_x;
        y = xy * 2.0 + c_y;
    }
    count
}

fn simd4(c_x: f32x4, c_y: f32x4, max_iter: u32) -> u32x4 {
    let mut x = c_x;
    let mut y = c_y;

    let mut count = u32x4::splat(0);
    for _ in 0..max_iter as usize {
        let xy = x * y;
        let xx = x * x;
        let yy = y * y;
        let sum = xx + yy;
        let mask = sum.lt(f32x4::splat(4.0));

        if !mask.any() { break }
        count = count + mask.to_i().select(u32x4::splat(1), u32x4::splat(0));

        x = xx - yy + c_x;
        y = xy + xy + c_y;
    }
    count
}

#[cfg(target_feature = "avx")]
fn simd8(c_x: f32x8, c_y: f32x8, max_iter: u32) -> u32x8 {
    let mut x = c_x;
    let mut y = c_y;

    let mut count = u32x8::splat(0);
    for _ in 0..max_iter as usize {
        let xy = x * y;
        let xx = x * x;
        let yy = y * y;
        let sum = xx + yy;
        let mask = sum.lt(f32x8::splat(4.0));

        if !mask.any() { break }
        count = count + mask.to_i().select(u32x8::splat(1), u32x8::splat(0));

        x = xx - yy + c_x;
        y = xy + xy + c_y;
    }
    count
}

const SCALE: f32 = 3.0 / 100.0;
const N: u32 = 100;
#[bench]
fn mandel_naive(b: &mut B) {
    b.iter(|| {
        for j in 0..100 {
            let y = -1.5 + (j as f32) * SCALE;
            for i in 0..100 {
                let x = -2.2 + (i as f32) * SCALE;
                bb(naive(x, y, N));
            }
        }
    })
}
#[bench]
fn mandel_simd4(b: &mut B) {
    let tweak = u32x4::new(0, 1, 2, 3);
    b.iter(|| {
        for j in 0..100 {
            let y = f32x4::splat(-1.5) + f32x4::splat(SCALE) * u32x4::splat(j).to_f32();
            for i in 0..25 {
                let i = u32x4::splat(i * 4) + tweak;
                let x = f32x4::splat(-2.2) + f32x4::splat(SCALE) * i.to_f32();
                bb(simd4(x, y, N));
            }
        }
    })
}
#[cfg(any(target_feature = "avx", target_feature = "avx2"))]
#[bench]
fn mandel_simd8(b: &mut B) {
    let tweak = u32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
    b.iter(|| {
        for j in 0..100 {
            let y = f32x8::splat(-1.5) + f32x8::splat(SCALE) * u32x8::splat(j).to_f32();
            for i in 0..13 { // 100 not divisible by 8 :(
                let i = u32x8::splat(i * 8) + tweak;
                let x = f32x8::splat(-2.2) + f32x8::splat(SCALE) * i.to_f32();
                bb(simd8(x, y, N));
            }
        }
    })
}
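Note: the masked-increment idiom in `simd4`/`simd8` above is what lets all lanes iterate together: `lt` produces a boolean mask, and `select` turns it into a per-lane 0/1 increment, so a lane stops counting once its point escapes while the loop keeps running until every lane is done. An isolated sketch of my own (not from the crate):

extern crate simd;
use simd::{f32x4, u32x4};

fn main() {
    // Pretend these are the per-lane |z|^2 values after one iteration.
    let sum = f32x4::new(1.0, 5.0, 3.0, 9.0);
    // Lanes 0 and 2 are still inside the |z|^2 < 4 circle...
    let mask = sum.lt(f32x4::splat(4.0));
    // ...so only they get +1; escaped lanes add 0.
    let count = u32x4::splat(0) + mask.to_i().select(u32x4::splat(1), u32x4::splat(0));
    println!("{} {} {} {}",
             count.extract(0), count.extract(1), count.extract(2), count.extract(3));
}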
benches/matrix.rs
@@ -0,0 +1,485 @@
#![feature(test)]
#![feature(cfg_target_feature)]
extern crate test;
extern crate simd;

use test::black_box as bb;
use test::Bencher as B;
use simd::f32x4;
#[cfg(target_feature = "avx")]
use simd::x86::avx::{f32x8, f64x4};
// #[cfg(target_feature = "avx2")]
// use simd::x86::avx2::Avx2F32x8;


#[bench]
fn multiply_naive(b: &mut B) {
    let x = [[1.0_f32; 4]; 4];
    let y = [[2.0; 4]; 4];
    b.iter(|| {
        for _ in 0..100 {
            let (x, y) = bb((&x, &y));

            bb(&[[x[0][0] * y[0][0] + x[1][0] * y[0][1] + x[2][0] * y[0][2] + x[3][0] * y[0][3],
                  x[0][1] * y[0][0] + x[1][1] * y[0][1] + x[2][1] * y[0][2] + x[3][1] * y[0][3],
                  x[0][2] * y[0][0] + x[1][2] * y[0][1] + x[2][2] * y[0][2] + x[3][2] * y[0][3],
                  x[0][3] * y[0][0] + x[1][3] * y[0][1] + x[2][3] * y[0][2] + x[3][3] * y[0][3]],
                 [x[0][0] * y[1][0] + x[1][0] * y[1][1] + x[2][0] * y[1][2] + x[3][0] * y[1][3],
                  x[0][1] * y[1][0] + x[1][1] * y[1][1] + x[2][1] * y[1][2] + x[3][1] * y[1][3],
                  x[0][2] * y[1][0] + x[1][2] * y[1][1] + x[2][2] * y[1][2] + x[3][2] * y[1][3],
                  x[0][3] * y[1][0] + x[1][3] * y[1][1] + x[2][3] * y[1][2] + x[3][3] * y[1][3]],
                 [x[0][0] * y[2][0] + x[1][0] * y[2][1] + x[2][0] * y[2][2] + x[3][0] * y[2][3],
                  x[0][1] * y[2][0] + x[1][1] * y[2][1] + x[2][1] * y[2][2] + x[3][1] * y[2][3],
                  x[0][2] * y[2][0] + x[1][2] * y[2][1] + x[2][2] * y[2][2] + x[3][2] * y[2][3],
                  x[0][3] * y[2][0] + x[1][3] * y[2][1] + x[2][3] * y[2][2] + x[3][3] * y[2][3]],
                 [x[0][0] * y[3][0] + x[1][0] * y[3][1] + x[2][0] * y[3][2] + x[3][0] * y[3][3],
                  x[0][1] * y[3][0] + x[1][1] * y[3][1] + x[2][1] * y[3][2] + x[3][1] * y[3][3],
                  x[0][2] * y[3][0] + x[1][2] * y[3][1] + x[2][2] * y[3][2] + x[3][2] * y[3][3],
                  x[0][3] * y[3][0] + x[1][3] * y[3][1] + x[2][3] * y[3][2] + x[3][3] * y[3][3]],
                 ]);
        }
    })
}

#[bench]
fn multiply_simd4_32(b: &mut B) {
    let x = [f32x4::splat(1.0_f32); 4];
    let y = [f32x4::splat(2.0); 4];
    b.iter(|| {
        for _ in 0..100 {
            let (x, y) = bb((&x, &y));

            let y0 = y[0];
            let y1 = y[1];
            let y2 = y[2];
            let y3 = y[3];
            bb(&[f32x4::splat(y0.extract(0)) * x[0] +
                 f32x4::splat(y0.extract(1)) * x[1] +
                 f32x4::splat(y0.extract(2)) * x[2] +
                 f32x4::splat(y0.extract(3)) * x[3],
                 f32x4::splat(y1.extract(0)) * x[0] +
                 f32x4::splat(y1.extract(1)) * x[1] +
                 f32x4::splat(y1.extract(2)) * x[2] +
                 f32x4::splat(y1.extract(3)) * x[3],
                 f32x4::splat(y2.extract(0)) * x[0] +
                 f32x4::splat(y2.extract(1)) * x[1] +
                 f32x4::splat(y2.extract(2)) * x[2] +
                 f32x4::splat(y2.extract(3)) * x[3],
                 f32x4::splat(y3.extract(0)) * x[0] +
                 f32x4::splat(y3.extract(1)) * x[1] +
                 f32x4::splat(y3.extract(2)) * x[2] +
                 f32x4::splat(y3.extract(3)) * x[3],
                 ]);
        }
    })
}

#[cfg(target_feature = "avx")]
#[bench]
fn multiply_simd4_64(b: &mut B) {
    let x = [f64x4::splat(1.0_f64); 4];
    let y = [f64x4::splat(2.0); 4];
    b.iter(|| {
        for _ in 0..100 {
            let (x, y) = bb((&x, &y));

            let y0 = y[0];
            let y1 = y[1];
            let y2 = y[2];
            let y3 = y[3];
            bb(&[f64x4::splat(y0.extract(0)) * x[0] +
                 f64x4::splat(y0.extract(1)) * x[1] +
                 f64x4::splat(y0.extract(2)) * x[2] +
                 f64x4::splat(y0.extract(3)) * x[3],
                 f64x4::splat(y1.extract(0)) * x[0] +
                 f64x4::splat(y1.extract(1)) * x[1] +
                 f64x4::splat(y1.extract(2)) * x[2] +
                 f64x4::splat(y1.extract(3)) * x[3],
                 f64x4::splat(y2.extract(0)) * x[0] +
                 f64x4::splat(y2.extract(1)) * x[1] +
                 f64x4::splat(y2.extract(2)) * x[2] +
                 f64x4::splat(y2.extract(3)) * x[3],
                 f64x4::splat(y3.extract(0)) * x[0] +
                 f64x4::splat(y3.extract(1)) * x[1] +
                 f64x4::splat(y3.extract(2)) * x[2] +
                 f64x4::splat(y3.extract(3)) * x[3],
                 ]);
        }
    })
}

#[bench]
fn inverse_naive(b: &mut B) {
    let mut x = [[0_f32; 4]; 4];
    for i in 0..4 { x[i][i] = 1.0 }

    b.iter(|| {
        for _ in 0..100 {
            let x = bb(&x);

            let mut t = [[0_f32; 4]; 4];
            for i in 0..4 {
                t[0][i] = x[i][0];
                t[1][i] = x[i][1];
                t[2][i] = x[i][2];
                t[3][i] = x[i][3];
            }

            let _0 = t[2][2] * t[3][3];
            let _1 = t[2][3] * t[3][2];
            let _2 = t[2][1] * t[3][3];
            let _3 = t[2][3] * t[3][1];
            let _4 = t[2][1] * t[3][2];
            let _5 = t[2][2] * t[3][1];
            let _6 = t[2][0] * t[3][3];
            let _7 = t[2][3] * t[3][0];
            let _8 = t[2][0] * t[3][2];
            let _9 = t[2][2] * t[3][0];
            let _10 = t[2][0] * t[3][1];
            let _11 = t[2][1] * t[3][0];

            let d00 = _0 * t[1][1] + _3 * t[1][2] + _4 * t[1][3] -
                      (_1 * t[1][1] + _2 * t[1][2] + _5 * t[1][3]);
            let d01 = _1 * t[1][0] + _6 * t[1][2] + _9 * t[1][3] -
                      (_0 * t[1][0] + _7 * t[1][2] + _8 * t[1][3]);
            let d02 = _2 * t[1][0] + _7 * t[1][1] + _10 * t[1][3] -
                      (_3 * t[1][0] + _6 * t[1][1] + _11 * t[1][3]);
            let d03 = _5 * t[1][0] + _8 * t[1][1] + _11 * t[1][2] -
                      (_4 * t[1][0] + _9 * t[1][1] + _10 * t[1][2]);
            let d10 = _1 * t[0][1] + _2 * t[0][2] + _5 * t[0][3] -
                      (_0 * t[0][1] + _3 * t[0][2] + _4 * t[0][3]);
            let d11 = _0 * t[0][0] + _7 * t[0][2] + _8 * t[0][3] -
                      (_1 * t[0][0] + _6 * t[0][2] + _9 * t[0][3]);
            let d12 = _3 * t[0][0] + _6 * t[0][1] + _11 * t[0][3] -
                      (_2 * t[0][0] + _7 * t[0][1] + _10 * t[0][3]);
            let d13 = _4 * t[0][0] + _9 * t[0][1] + _10 * t[0][2] -
                      (_5 * t[0][0] + _8 * t[0][1] + _11 * t[0][2]);

            let _0 = t[0][2] * t[1][3];
            let _1 = t[0][3] * t[1][2];
            let _2 = t[0][1] * t[1][3];
            let _3 = t[0][3] * t[1][1];
            let _4 = t[0][1] * t[1][2];
            let _5 = t[0][2] * t[1][1];
            let _6 = t[0][0] * t[1][3];
            let _7 = t[0][3] * t[1][0];
            let _8 = t[0][0] * t[1][2];
            let _9 = t[0][2] * t[1][0];
            let _10 = t[0][0] * t[1][1];
            let _11 = t[0][1] * t[1][0];

            let d20 = _0*t[3][1] + _3*t[3][2] + _4*t[3][3]-
                      (_1*t[3][1] + _2*t[3][2] + _5*t[3][3]);
            let d21 = _1*t[3][0] + _6*t[3][2] + _9*t[3][3]-
                      (_0*t[3][0] + _7*t[3][2] + _8*t[3][3]);
            let d22 = _2*t[3][0] + _7*t[3][1] + _10*t[3][3]-
                      (_3*t[3][0] + _6*t[3][1] + _11*t[3][3]);
            let d23 = _5*t[3][0] + _8*t[3][1] + _11*t[3][2]-
                      (_4*t[3][0] + _9*t[3][1] + _10*t[3][2]);
            let d30 = _2*t[2][2] + _5*t[2][3] + _1*t[2][1]-
                      (_4*t[2][3] + _0*t[2][1] + _3*t[2][2]);
            let d31 = _8*t[2][3] + _0*t[2][0] + _7*t[2][2]-
                      (_6*t[2][2] + _9*t[2][3] + _1*t[2][0]);
            let d32 = _6*t[2][1] + _11*t[2][3] + _3*t[2][0]-
                      (_10*t[2][3] + _2*t[2][0] + _7*t[2][1]);
            let d33 = _10*t[2][2] + _4*t[2][0] + _9*t[2][1]-
                      (_8*t[2][1] + _11*t[2][2] + _5*t[2][0]);

            let det = t[0][0] * d00 + t[0][1] * d01 + t[0][2] * d02 + t[0][3] * d03;

            let det = 1.0 / det;
            let mut ret = [[d00, d01, d02, d03],
                           [d10, d11, d12, d13],
                           [d20, d21, d22, d23],
                           [d30, d31, d32, d33]];
            for i in 0..4 {
                for j in 0..4 {
                    ret[i][j] *= det;
                }
            }
            bb(&ret);
        }
    })
}

#[bench]
fn inverse_simd4(b: &mut B) {
    let mut x = [f32x4::splat(0_f32); 4];
    for i in 0..4 { x[i] = x[i].replace(i as u32, 1.0); }

    fn shuf0145(v: f32x4, w: f32x4) -> f32x4 {
        f32x4::new(v.extract(0), v.extract(1),
                   w.extract(4 - 4), w.extract(5 - 4))
    }
    fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
        f32x4::new(v.extract(0), v.extract(2),
                   w.extract(4 - 4), w.extract(6 - 4))
    }
    fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
        f32x4::new(v.extract(1), v.extract(3),
                   w.extract(5 - 4), w.extract(7 - 4))
    }
    fn shuf2367(v: f32x4, w: f32x4) -> f32x4 {
        f32x4::new(v.extract(2), v.extract(3),
                   w.extract(6 - 4), w.extract(7 - 4))
    }

    fn swiz1032(v: f32x4) -> f32x4 {
        f32x4::new(v.extract(1), v.extract(0),
                   v.extract(3), v.extract(2))
    }
    fn swiz2301(v: f32x4) -> f32x4 {
        f32x4::new(v.extract(2), v.extract(3),
                   v.extract(0), v.extract(1))
    }

    b.iter(|| {
        for _ in 0..100 {
            let src0;
            let src1;
            let src2;
            let src3;
            let mut tmp1;
            let row0;
            let mut row1;
            let mut row2;
            let mut row3;
            let mut minor0;
            let mut minor1;
            let mut minor2;
            let mut minor3;
            let mut det;

            let x = bb(&x);
            src0 = x[0];
            src1 = x[1];
            src2 = x[2];
            src3 = x[3];

            tmp1 = shuf0145(src0, src1);
            row1 = shuf0145(src2, src3);
            row0 = shuf0246(tmp1, row1);
            row1 = shuf1357(row1, tmp1);

            tmp1 = shuf2367(src0, src1);
            row3 = shuf2367(src2, src3);
            row2 = shuf0246(tmp1, row3);
            row3 = shuf0246(row3, tmp1);


            tmp1 = row2 * row3;
            tmp1 = swiz1032(tmp1);
            minor0 = row1 * tmp1;
            minor1 = row0 * tmp1;
            tmp1 = swiz2301(tmp1);
            minor0 = (row1 * tmp1) - minor0;
            minor1 = (row0 * tmp1) - minor1;
            minor1 = swiz2301(minor1);


            tmp1 = row1 * row2;
            tmp1 = swiz1032(tmp1);
            minor0 = (row3 * tmp1) + minor0;
            minor3 = row0 * tmp1;
            tmp1 = swiz2301(tmp1);

            minor0 = minor0 - row3 * tmp1;
            minor3 = row0 * tmp1 - minor3;
            minor3 = swiz2301(minor3);


            tmp1 = row3 * swiz2301(row1);
            tmp1 = swiz1032(tmp1);
            row2 = swiz2301(row2);
            minor0 = row2 * tmp1 + minor0;
            minor2 = row0 * tmp1;
            tmp1 = swiz2301(tmp1);
            minor0 = minor0 - row2 * tmp1;
            minor2 = row0 * tmp1 - minor2;
            minor2 = swiz2301(minor2);


            tmp1 = row0 * row1;
            tmp1 = swiz1032(tmp1);
            minor2 = minor2 + row3 * tmp1;
            minor3 = row2 * tmp1 - minor3;
            tmp1 = swiz2301(tmp1);
            minor2 = row3 * tmp1 - minor2;
            minor3 = minor3 - row2 * tmp1;


            tmp1 = row0 * row3;
            tmp1 = swiz1032(tmp1);
            minor1 = minor1 - row2 * tmp1;
            minor2 = row1 * tmp1 + minor2;
            tmp1 = swiz2301(tmp1);
            minor1 = row2 * tmp1 + minor1;
            minor2 = minor2 - row1 * tmp1;

            tmp1 = row0 * row2;
            tmp1 = swiz1032(tmp1);
            minor1 = row3 * tmp1 + minor1;
            minor3 = minor3 - row1 * tmp1;
            tmp1 = swiz2301(tmp1);
            minor1 = minor1 - row3 * tmp1;
            minor3 = row1 * tmp1 + minor3;

            det = row0 * minor0;
            det = swiz2301(det) + det;
            det = swiz1032(det) + det;
            //tmp1 = det.approx_reciprocal(); det = tmp1 * (f32x4::splat(2.0) - det * tmp1);
            det = f32x4::splat(1.0) / det;

            bb(&[minor0 * det, minor1 * det, minor2 * det, minor3 * det]);
        }
    })

}

#[bench]
fn transpose_naive(b: &mut B) {
    let x = [[0_f32; 4]; 4];
    b.iter(|| {
        for _ in 0..100 {
            let x = bb(&x);
            bb(&[[x[0][0], x[1][0], x[2][0], x[3][0]],
                 [x[0][1], x[1][1], x[2][1], x[3][1]],
                 [x[0][2], x[1][2], x[2][2], x[3][2]],
                 [x[0][3], x[1][3], x[2][3], x[3][3]]]);
        }
    })
}

#[bench]
fn transpose_simd4(b: &mut B) {
    let x = [f32x4::splat(0_f32); 4];

    fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
        f32x4::new(v.extract(0), v.extract(2),
                   w.extract(4 - 4), w.extract(6 - 4))
    }
    fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
        f32x4::new(v.extract(1), v.extract(3),
                   w.extract(5 - 4), w.extract(7 - 4))
    }
    b.iter(|| {
        for _ in 0..100 {
            let x = bb(&x);
            let x0 = x[0];
            let x1 = x[1];
            let x2 = x[2];
            let x3 = x[3];

            let a0 = shuf0246(x0, x1);
            let a1 = shuf0246(x2, x3);
            let a2 = shuf1357(x0, x1);
            let a3 = shuf1357(x2, x3);

            let b0 = shuf0246(a0, a1);
            let b1 = shuf0246(a2, a3);
            let b2 = shuf1357(a0, a1);
            let b3 = shuf1357(a2, a3);
            bb(&[b0, b1, b2, b3]);
        }
    })
}

#[cfg(target_feature = "avx")]
#[bench]
fn transpose_simd8_naive(b: &mut B) {
    let x = [f32x8::splat(0_f32); 2];

    fn shuf0246(v: f32x8, w: f32x8) -> f32x8 {
        f32x8::new(v.extract(0), v.extract(2), v.extract(4), v.extract(6),
                   w.extract(0), w.extract(2), w.extract(4), w.extract(6))
    }
    fn shuf1357(v: f32x8, w: f32x8) -> f32x8 {
        f32x8::new(v.extract(1), v.extract(3), v.extract(5), v.extract(7),
                   w.extract(1), w.extract(3), w.extract(5), w.extract(7),)
    }
    b.iter(|| {
        for _ in 0..100 {
            let x = bb(&x);
            let x01 = x[0];
            let x23 = x[1];

            let a01 = shuf0246(x01, x23);
            let a23 = shuf1357(x01, x23);

            let b01 = shuf0246(a01, a23);
            let b23 = shuf1357(a01, a23);
            bb(&[b01, b23]);
        }
    })
}

#[cfg(target_feature = "avx")]
#[bench]
fn transpose_simd8_avx2_vpermps(b: &mut B) {
    let x = [f32x8::splat(0_f32); 2];

    // efficient on AVX2 using vpermps
    fn perm04152637(v: f32x8) -> f32x8 {
        // broken on rustc 1.7.0-nightly (1ddaf8bdf 2015-12-12)
        // v.permutevar(i32x8::new(0, 4, 1, 5, 2, 6, 3, 7))
        f32x8::new(v.extract(0), v.extract(4), v.extract(1), v.extract(5),
                   v.extract(2), v.extract(6), v.extract(3), v.extract(7))
    }
    fn shuf_lo(v: f32x8, w: f32x8) -> f32x8 {
        f32x8::new(v.extract(0), v.extract(1), w.extract(0), w.extract(1),
                   v.extract(4), v.extract(5), w.extract(4), w.extract(5),)
    }
    fn shuf_hi(v: f32x8, w: f32x8) -> f32x8 {
        f32x8::new(v.extract(2), v.extract(3), w.extract(2), w.extract(3),
                   v.extract(6), v.extract(7), w.extract(6), w.extract(7),)
    }
    b.iter(|| {
        for _ in 0..100 {
            let x = bb(&x);
            let x01 = x[0];
            let x23 = x[1];

            let a01 = perm04152637(x01);
            let a23 = perm04152637(x23);

            let b01 = shuf_lo(a01, a23);
            let b23 = shuf_hi(a01, a23);
            bb(&[b01, b23]);
        }
    })
}

#[cfg(target_feature = "avx")]
#[bench]
fn transpose_simd8_avx2_vpermpd(b: &mut B) {
    let x = [f32x8::splat(0_f32); 2];

    // efficient on AVX2 using vpermpd
    fn perm01452367(v: f32x8) -> f32x8 {
        f32x8::new(v.extract(0), v.extract(1), v.extract(4), v.extract(5),
                   v.extract(2), v.extract(3), v.extract(6), v.extract(7))
    }
    fn shuf_lo_ps(v: f32x8, w: f32x8) -> f32x8 {
        f32x8::new(v.extract(0), w.extract(0), v.extract(1), w.extract(1),
                   v.extract(4), w.extract(4), v.extract(5), w.extract(5),)
    }
    fn shuf_hi_ps(v: f32x8, w: f32x8) -> f32x8 {
        f32x8::new(v.extract(2), w.extract(2), v.extract(3), w.extract(3),
                   v.extract(6), w.extract(6), v.extract(7), w.extract(7),)
    }
    b.iter(|| {
        for _ in 0..100 {
            let x = bb(&x);
            let x01 = x[0];
            let x23 = x[1];

            let a01 = perm01452367(x01);
            let a23 = perm01452367(x23);

            let b01 = shuf_lo_ps(a01, a23);
            let b23 = shuf_hi_ps(a01, a23);
            bb(&[b01, b23]);
        }
    })
}
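Note on `transpose_simd4` above: `shuf0246` packs the even lanes of two vectors and `shuf1357` the odd lanes, and two rounds of that pair turn four rows into four columns. A standalone check of my own (not part of the benchmark), with `w.extract(4 - 4)` written as `w.extract(0)`:

extern crate simd;
use simd::f32x4;

// Even lanes of v, then even lanes of w.
fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
    f32x4::new(v.extract(0), v.extract(2), w.extract(0), w.extract(2))
}
// Odd lanes of v, then odd lanes of w.
fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
    f32x4::new(v.extract(1), v.extract(3), w.extract(1), w.extract(3))
}

fn main() {
    // Rows of a 4x4 matrix; entry (r, c) is encoded as 10*r + c.
    let x0 = f32x4::new(0.0, 1.0, 2.0, 3.0);
    let x1 = f32x4::new(10.0, 11.0, 12.0, 13.0);
    let x2 = f32x4::new(20.0, 21.0, 22.0, 23.0);
    let x3 = f32x4::new(30.0, 31.0, 32.0, 33.0);

    // First round: gather even/odd lanes across the row pairs.
    let a0 = shuf0246(x0, x1);
    let a1 = shuf0246(x2, x3);
    let a2 = shuf1357(x0, x1);
    let a3 = shuf1357(x2, x3);

    // Second round yields the columns: b0 = (0, 10, 20, 30), and so on.
    let b0 = shuf0246(a0, a1);
    let b1 = shuf0246(a2, a3);
    let b2 = shuf1357(a0, a1);
    let b3 = shuf1357(a2, a3);
    for v in &[b0, b1, b2, b3] {
        println!("{} {} {} {}", v.extract(0), v.extract(1), v.extract(2), v.extract(3));
    }
}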
examples/axpy.rs
@@ -0,0 +1,65 @@
#![feature(cfg_target_feature)]
extern crate simd;
use simd::f32x4;
#[cfg(target_feature = "avx")]
use simd::x86::avx::f32x8;

#[inline(never)]
pub fn axpy(z: &mut [f32], a: f32, x: &[f32], y: &[f32]) {
    assert_eq!(x.len(), y.len());
    assert_eq!(x.len(), z.len());

    let len = std::cmp::min(std::cmp::min(x.len(), y.len()), z.len());

    let mut i = 0;
    while i < len & !3 {
        let x = f32x4::load(x, i);
        let y = f32x4::load(y, i);
        (f32x4::splat(a) * x + y).store(z, i);
        i += 4
    }
}

#[cfg(target_feature = "avx")]
#[inline(never)]
pub fn axpy8(z: &mut [f32], a: f32, x: &[f32], y: &[f32]) {
    assert_eq!(x.len(), y.len());
    assert_eq!(x.len(), z.len());

    let len = std::cmp::min(std::cmp::min(x.len(), y.len()), z.len());

    let mut i = 0;
    while i < len & !7 {
        let x = f32x8::load(x, i);
        let y = f32x8::load(y, i);
        (f32x8::splat(a) * x + y).store(z, i);
        i += 8
    }
}


#[cfg(not(target_feature = "avx"))]
pub fn axpy8(_: &mut [f32], _: f32, _: &[f32], _: &[f32]) {
    unimplemented!()
}


fn main() {
    let mut z = vec![0.; 4];
    axpy(&mut z, 2., &[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]);
    println!("{:?}", z);
    let mut z = vec![0.; 8];
    axpy(&mut z, 3., &[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
         &[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]);
    println!("{:?}", z);

    if cfg!(target_feature = "avx") {
        let mut z = vec![0.; 4];
        axpy8(&mut z, 2., &[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]);
        println!("{:?}", z);
        let mut z = vec![0.; 8];
        axpy8(&mut z, 3., &[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
              &[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]);
        println!("{:?}", z);
    }
}
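Note: `axpy` only processes `len & !3` elements (`len` rounded down to a multiple of 4), so up to three trailing elements are left untouched; the same applies to the other whole-slice examples here. A hypothetical variant of my own (not in this commit) that finishes with a scalar tail:

extern crate simd;
use simd::f32x4;

pub fn axpy_with_tail(z: &mut [f32], a: f32, x: &[f32], y: &[f32]) {
    assert_eq!(x.len(), y.len());
    assert_eq!(x.len(), z.len());

    let len = x.len();
    let mut i = 0;
    // Vector body, identical to axpy above.
    while i < len & !3 {
        let xv = f32x4::load(x, i);
        let yv = f32x4::load(y, i);
        (f32x4::splat(a) * xv + yv).store(z, i);
        i += 4
    }
    // Scalar tail for the remaining len % 4 elements.
    while i < len {
        z[i] = a * x[i] + y[i];
        i += 1
    }
}

fn main() {
    let mut z = vec![0.; 6];
    axpy_with_tail(&mut z, 2., &[1., 2., 3., 4., 5., 6.], &[1.; 6]);
    println!("{:?}", z);
}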
examples/convert.rs
@@ -0,0 +1,38 @@
extern crate simd;
use simd::f32x4;

#[inline(never)]
pub fn convert_scalar(x: &mut [i32], y: &[f32]) {
    assert_eq!(x.len(), y.len());

    let mut i = 0;
    while i < x.len() & !3 {
        x[i] = y[i] as i32;
        i += 1;
    }
}

#[inline(never)]
pub fn convert(x: &mut [i32], y: &[f32]) {
    assert_eq!(x.len(), y.len());

    let mut i = 0;
    while i < x.len() & !3 {
        let v = f32x4::load(y, i);
        v.to_i32().store(x, i);
        i += 4
    }
}

fn main() {
    let x = &mut [0; 12];
    let y = [1.0; 12];
    convert(x, &y);
    convert_scalar(x, &y);
    println!("{:?}", x);
    let x = &mut [0; 16];
    let y = [1.0; 16];
    convert(x, &y);
    convert_scalar(x, &y);
    println!("{:?}", x);
}
examples/dot-product.rs
@@ -0,0 +1,60 @@
#![feature(cfg_target_feature)]
extern crate simd;
use simd::f32x4;
#[cfg(target_feature = "avx")]
use simd::x86::avx::{f32x8, LowHigh128};

#[inline(never)]
pub fn dot(x: &[f32], y: &[f32]) -> f32 {
    assert_eq!(x.len(), y.len());

    let len = std::cmp::min(x.len(), y.len());

    let mut sum = f32x4::splat(0.0);
    let mut i = 0;
    while i < len & !3 {
        let x = f32x4::load(x, i);
        let y = f32x4::load(y, i);
        sum = sum + x * y;
        i += 4
    }
    sum.extract(0) + sum.extract(1) + sum.extract(2) + sum.extract(3)
}

#[cfg(target_feature = "avx")]
#[inline(never)]
pub fn dot8(x: &[f32], y: &[f32]) -> f32 {
    assert_eq!(x.len(), y.len());

    let len = std::cmp::min(x.len(), y.len());

    let mut sum = f32x8::splat(0.0);
    let mut i = 0;
    while i < len & !7 {
        let x = f32x8::load(x, i);
        let y = f32x8::load(y, i);
        sum = sum + x * y;
        i += 8
    }
    let sum = sum.low() + sum.high();
    sum.extract(0) + sum.extract(1) + sum.extract(2) + sum.extract(3)
}


#[cfg(not(target_feature = "avx"))]
pub fn dot8(_: &[f32], _: &[f32]) -> f32 {
    unimplemented!()
}


fn main() {
    println!("{}", dot(&[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]));
    println!("{}", dot(&[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
                       &[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]));

    if cfg!(target_feature = "avx") {
        println!("{}", dot8(&[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]));
        println!("{}", dot8(&[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
                            &[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]));
    }
}
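Note on the reduction at the end of `dot`: the loop accumulates four interleaved partial sums (lane j holds the products at indices congruent to j mod 4), so the final horizontal add of the lanes yields the same value a scalar loop would produce. A tiny equivalence check of my own (not from the repository):

extern crate simd;
use simd::f32x4;

fn main() {
    let x = [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let y = [8.0_f32, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0];

    // Vector accumulation: lane j sums the products at indices j, j + 4, ...
    let mut sum = f32x4::splat(0.0);
    let mut i = 0;
    while i < x.len() & !3 {
        sum = sum + f32x4::load(&x, i) * f32x4::load(&y, i);
        i += 4;
    }
    let vector = sum.extract(0) + sum.extract(1) + sum.extract(2) + sum.extract(3);

    // Scalar reference computation.
    let scalar: f32 = x.iter().zip(y.iter()).map(|(a, b)| a * b).sum();
    println!("{} == {}", vector, scalar);
}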
examples/fannkuch-redux-nosimd.rs
@@ -0,0 +1,156 @@
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// contributed by the Rust Project Developers
// contributed by TeXitoi

use std::{cmp, mem};
use std::thread;

fn rotate(x: &mut [i32]) {
    let mut prev = x[0];
    for place in x.iter_mut().rev() {
        prev = mem::replace(place, prev)
    }
}

fn next_permutation(perm: &mut [i32], count: &mut [i32]) {
    for i in 1..perm.len() {
        rotate(&mut perm[.. i + 1]);
        let count_i = &mut count[i];
        if *count_i >= i as i32 {
            *count_i = 0;
        } else {
            *count_i += 1;
            break
        }
    }
}

#[derive(Clone, Copy)]
struct P {
    p: [i32; 16],
}

#[derive(Clone, Copy)]
struct Perm {
    cnt: [i32; 16],
    fact: [u32; 16],
    n: u32,
    permcount: u32,
    perm: P,
}

impl Perm {
    fn new(n: u32) -> Perm {
        let mut fact = [1; 16];
        for i in 1 .. n as usize + 1 {
            fact[i] = fact[i - 1] * i as u32;
        }
        Perm {
            cnt: [0; 16],
            fact: fact,
            n: n,
            permcount: 0,
            perm: P { p: [0; 16 ] }
        }
    }

    fn get(&mut self, mut idx: i32) -> P {
        let mut pp = [0u8; 16];
        self.permcount = idx as u32;
        for (i, place) in self.perm.p.iter_mut().enumerate() {
            *place = i as i32 + 1;
        }

        for i in (1 .. self.n as usize).rev() {
            let d = idx / self.fact[i] as i32;
            self.cnt[i] = d;
            idx %= self.fact[i] as i32;
            for (place, val) in pp.iter_mut().zip(self.perm.p[..(i+1)].iter()) {
                *place = (*val) as u8
            }

            let d = d as usize;
            for j in 0 .. i + 1 {
                self.perm.p[j] = if j + d <= i {pp[j + d]} else {pp[j+d-i-1]} as i32;
            }
        }

        self.perm
    }

    fn count(&self) -> u32 { self.permcount }
    fn max(&self) -> u32 { self.fact[self.n as usize] }

    fn next(&mut self) -> P {
        next_permutation(&mut self.perm.p, &mut self.cnt);
        self.permcount += 1;

        self.perm
    }
}


fn reverse(tperm: &mut [i32], k: usize) {
    tperm[..k].reverse()
}

fn work(mut perm: Perm, n: usize, max: usize) -> (i32, i32) {
    let mut checksum = 0;
    let mut maxflips = 0;

    let mut p = perm.get(n as i32);

    while perm.count() < max as u32 {
        let mut flips = 0;

        while p.p[0] != 1 {
            let k = p.p[0] as usize;
            reverse(&mut p.p, k);
            flips += 1;
        }

        checksum += if perm.count() % 2 == 0 {flips} else {-flips};
        maxflips = cmp::max(maxflips, flips);

        p = perm.next();
    }

    (checksum, maxflips)
}

fn fannkuch(n: i32) -> (i32, i32) {
    let perm = Perm::new(n as u32);

    let n = 1;
    let mut futures = vec![];
    let k = perm.max() / n;

    for j in (0..).map(|x| x * k).take_while(|&j| j < k * n) {
        let max = cmp::min(j+k, perm.max());

        futures.push(thread::spawn(move|| {
            work(perm, j as usize, max as usize)
        }))
    }

    let mut checksum = 0;
    let mut maxflips = 0;
    for fut in futures.into_iter() {
        let (cs, mf) = fut.join().unwrap();
        checksum += cs;
        maxflips = cmp::max(maxflips, mf);
    }
    (checksum, maxflips)
}

fn main() {
    let n = std::env::args_os().nth(1)
        .and_then(|s| s.into_string().ok())
        .and_then(|n| n.parse().ok())
        .unwrap_or(7);

    let (checksum, maxflips) = fannkuch(n);
    println!("{}\nPfannkuchen({}) = {}", checksum, n, maxflips);
}
examples/fannkuch-redux.rs
@@ -0,0 +1,233 @@
#![feature(cfg_target_feature)]
extern crate simd;
#[macro_use] extern crate cfg_if;
use simd::u8x16;

use std::{env, process};

cfg_if! {
    if #[cfg(target_arch = "aarch64")] {
        #[inline(always)]
        fn shuffle(x: u8x16, y: u8x16) -> u8x16 {
            use simd::aarch64::neon::*;
            y.table_lookup_1(x)
        }
    } else if #[cfg(all(target_arch = "arm",
                        target_feature = "neon"))] {
        #[inline(always)]
        fn shuffle(x: u8x16, y: u8x16) -> u8x16 {
            use simd::arm::neon::*;
            #[inline(always)]
            fn split(x: u8x16) -> (u8x8, u8x8) {
                unsafe {std::mem::transmute(x)}
            }
            fn join(x: u8x8, y: u8x8) -> u8x16 {
                unsafe {std::mem::transmute((x, y))}
            }

            let (t0, t1) = split(x);
            let (i0, i1) = split(y);
            join(i0.table_lookup_2(t0, t1),
                 i1.table_lookup_2(t0, t1))
        }
    } else if #[cfg(target_feature = "ssse3")] {
        #[inline(always)]
        fn shuffle(x: u8x16, y: u8x16) -> u8x16 {
            use simd::x86::ssse3::*;
            x.shuffle_bytes(y)
        }
    } else {
        // slow fallback, so tests work
        #[inline(always)]
        fn shuffle(x: u8x16, y: u8x16) -> u8x16 {
            u8x16::new(x.extract(y.extract(0) as u32),
                       x.extract(y.extract(1) as u32),
                       x.extract(y.extract(2) as u32),
                       x.extract(y.extract(3) as u32),
                       x.extract(y.extract(4) as u32),
                       x.extract(y.extract(5) as u32),
                       x.extract(y.extract(6) as u32),
                       x.extract(y.extract(7) as u32),
                       x.extract(y.extract(8) as u32),
                       x.extract(y.extract(9) as u32),
                       x.extract(y.extract(10) as u32),
                       x.extract(y.extract(11) as u32),
                       x.extract(y.extract(12) as u32),
                       x.extract(y.extract(13) as u32),
                       x.extract(y.extract(14) as u32),
                       x.extract(y.extract(15) as u32))
        }
    }
}
struct State {
    s: [u8; 16],
    flip_masks: [u8x16; 16],
    rotate_masks: [u8x16; 16],

    maxflips: i32,
    odd: u16,
    checksum: i32,
}
impl State {
    fn new() -> State {
        State {
            s: [0; 16],
            flip_masks: [u8x16::splat(0); 16],
            rotate_masks: [u8x16::splat(0); 16],

            maxflips: 0,
            odd: 0,
            checksum: 0,
        }
    }
    #[inline(never)]
    fn rotate_sisd(&mut self, n: usize) {
        let c = self.s[0];
        for i in 1..(n + 1) {
            self.s[i - 1] = self.s[i];
        }
        self.s[n] = c;
    }
    #[inline(never)]
    fn popmasks(&mut self) {
        let mut mask = [0_u8; 16];
        for i in 0..16 {
            for j in 0..16 { mask[j] = j as u8; }

            for x in 0..(i+1)/2 {
                mask.swap(x, i - x);
            }

            self.flip_masks[i] = u8x16::load(&mask, 0);

            for j in 0..16 { self.s[j] = j as u8; }
            self.rotate_sisd(i);
            self.rotate_masks[i] = self.load_s();
        }
    }
    fn rotate(&mut self, n: usize) {
        shuffle(self.load_s(), self.rotate_masks[n]).store(&mut self.s, 0)
    }

    fn load_s(&self) -> u8x16 {
        u8x16::load(&self.s, 0)
    }


    #[inline(never)]
    fn tk(&mut self, n: usize) {
        #[derive(Copy, Clone, Debug)]
        struct Perm {
            perm: u8x16,
            start: u8,
            odd: u16
        }

        let mut perms = [Perm { perm: u8x16::splat(0), start: 0 , odd: 0 }; 60];

        let mut i = 0;
        let mut c = [0_u8; 16];
        let mut perm_max = 0;

        while i < n {
            while i < n && perm_max < 60 {
                self.rotate(i);
                if c[i] as usize >= i {
                    c[i] = 0;
                    i += 1;
                    continue
                }

                c[i] += 1;
                i = 1;
                self.odd = !self.odd;
                if self.s[0] != 0 {
                    if self.s[self.s[0] as usize] != 0 {
                        perms[perm_max].perm = self.load_s();
                        perms[perm_max].start = self.s[0];
                        perms[perm_max].odd = self.odd;
                        perm_max += 1;
                    } else {
                        if self.maxflips == 0 { self.maxflips = 1 }
                        self.checksum += if self.odd != 0 { -1 } else { 1 };
                    }
                }
            }

            let mut k = 0;
            while k < std::cmp::max(1, perm_max) - 1 {
                let pk = &perms[k];
                let pk1 = &perms[k + 1];
                //println!("perm1 {:?}\nperm2 {:?}", pk.perm, pk1.perm);
                let mut perm1 = pk.perm;
                let mut perm2 = pk1.perm;

                let mut f1 = 0;
                let mut f2 = 0;
                let mut toterm1 = pk.start;
                let mut toterm2 = pk1.start;

                while toterm1 != 0 && toterm2 != 0 {
                    perm1 = shuffle(perm1, self.flip_masks[toterm1 as usize]);
                    perm2 = shuffle(perm2, self.flip_masks[toterm2 as usize]);
                    toterm1 = perm1.extract(0);
                    toterm2 = perm2.extract(0);

                    f1 += 1; f2 += 1;
                }
                while toterm1 != 0 {
                    perm1 = shuffle(perm1, self.flip_masks[toterm1 as usize]);
                    toterm1 = perm1.extract(0);
                    f1 += 1;
                }
                while toterm2 != 0 {
                    perm2 = shuffle(perm2, self.flip_masks[toterm2 as usize]);
                    toterm2 = perm2.extract(0);
                    f2 += 1;
                }

                if f1 > self.maxflips { self.maxflips = f1 }
                if f2 > self.maxflips { self.maxflips = f2 }
                self.checksum += if pk.odd != 0 { -f1 } else { f1 };
                self.checksum += if pk1.odd != 0 { -f2 } else { f2 };

                k += 2;
            }
            while k < perm_max {
                let pk = &perms[k];
                let mut perm = pk.perm;
                let mut f = 0;
                let mut toterm = pk.start;
                while toterm != 0 {
                    perm = shuffle(perm, self.flip_masks[toterm as usize]);
                    toterm = perm.extract(0);
                    f += 1;
                }
                if f > self.maxflips { self.maxflips = f }
                self.checksum += if pk.odd != 0 { -f } else { f };
                k += 1
            }
            perm_max = 0;
        }
    }
}

fn main() {
    let mut state = State::new();
    state.popmasks();

    let args = env::args().collect::<Vec<_>>();
    if args.len() < 2 {
        println!("usage: {} number", args[0]);
        process::exit(1)
    }
    let max_n = args[1].parse().unwrap();
    if max_n < 3 || max_n > 15 {
        println!("range: must be 3 <= n <= 14");
        process::exit(1);
    }
    for i in 0..max_n { state.s[i] = i as u8 }
    state.tk(max_n);

    println!("{}\nPfannkuchen({}) = {}", state.checksum, max_n, state.maxflips);
}
@ -0,0 +1,125 @@
|
|||
#![feature(step_by, test)]
|
||||
|
||||
extern crate test;
|
||||
extern crate simd;
|
||||
use simd::{f32x4, u32x4};
|
||||
use std::io::prelude::*;
|
||||
|
||||
#[inline(never)]
|
||||
fn mandelbrot_naive(c_x: f32, c_y: f32, max_iter: u32) -> u32 {
|
||||
let mut x = c_x;
|
||||
let mut y = c_y;
|
||||
let mut count = 0;
|
||||
while count < max_iter {
|
||||
let xy = x * y;
|
||||
let xx = x * x;
|
||||
let yy = y * y;
|
||||
let sum = xx + yy;
|
||||
if sum > 4.0 {
|
||||
break
|
||||
}
|
||||
count += 1;
|
||||
x = xx - yy + c_x;
|
||||
y = xy * 2.0 + c_y;
|
||||
}
|
||||
count
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn mandelbrot_vector(c_x: f32x4, c_y: f32x4, max_iter: u32) -> u32x4 {
|
||||
let mut x = c_x;
|
||||
let mut y = c_y;
|
||||
|
||||
let mut count = u32x4::splat(0);
|
||||
for _ in 0..max_iter as usize {
|
||||
let xy = x * y;
|
||||
let xx = x * x;
|
||||
let yy = y * y;
|
||||
let sum = xx + yy;
|
||||
let mask = sum.lt(f32x4::splat(4.0));
|
||||
|
||||
if !mask.any() { break }
|
||||
count = count + mask.to_i().select(u32x4::splat(1),
|
||||
u32x4::splat(0));
|
||||
|
||||
x = xx - yy + c_x;
|
||||
y = xy + xy + c_y;
|
||||
}
|
||||
count
|
||||
}

const COLOURS: &'static [(f32, f32, f32)] = &[(0.0, 7.0, 100.0),
                                              (32.0, 107.0, 203.0),
                                              (237.0, 255.0, 255.0),
                                              (255.0, 170.0, 0.0),
                                              (0.0, 2.0, 0.0)];
const SCALE: f32 = 12.0;
const LIMIT: u32 = 100;

#[inline(never)]
fn output_one(buf: &mut [u8], val: u32) {
    let (r, g, b);
    if val == LIMIT {
        r = 0;
        g = 0;
        b = 0;
    } else {
        let val = (val as f32 % SCALE) * (COLOURS.len() as f32) / SCALE;
        let left = val as usize % COLOURS.len();
        let right = (left + 1) % COLOURS.len();

        let p = val - left as f32;
        let (r1, g1, b1) = COLOURS[left];
        let (r2, g2, b2) = COLOURS[right];
        r = (r1 + (r2 - r1) * p) as u8;
        g = (g1 + (g2 - g1) * p) as u8;
        b = (b1 + (b2 - b1) * p) as u8;
    }
    buf[0] = r;
    buf[1] = g;
    buf[2] = b;
}

fn main() {
    let mut args = std::env::args();
    args.next();
    let width = args.next().unwrap().parse().unwrap();
    let height = args.next().unwrap().parse().unwrap();

    let left = -2.2;
    let right = left + 3.0;
    let top = 1.0;
    let bottom = top - 2.0;

    let width_step: f32 = (right - left) / width as f32;
    let height_step: f32 = (bottom - top) / height as f32;

    let adjust = f32x4::splat(width_step) * f32x4::new(0., 1., 2., 3.);

    println!("P6 {} {} 255", width, height);
    let mut line = vec![0; width * 3];

    if args.next().is_none() {
        for i in 0..height {
            let y = f32x4::splat(top + height_step * i as f32);
            for j in (0..width).step_by(4) {
                let x = f32x4::splat(left + width_step * j as f32) + adjust;
                let ret = mandelbrot_vector(x, y, LIMIT);
                test::black_box(ret);
                for k in 0..4 {
                    let val = ret.extract(k as u32);
                    output_one(&mut line[3*(j + k)..3*(j + k + 1)], val);
                }
            }
            // write_all avoids silently dropping bytes on a short write
            ::std::io::stdout().write_all(&line).unwrap();
        }
    } else {
        for i in 0..height {
            let y = top + height_step * i as f32;
            for j in 0..width {
                let x = left + width_step * j as f32;
                let val = mandelbrot_naive(x, y, LIMIT);
                test::black_box(val);
                output_one(&mut line[3*j..3*(j + 1)], val);
            }
            ::std::io::stdout().write_all(&line).unwrap();
        }
    }
}
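
// Usage note (ours): the program writes a binary PPM image to stdout,
// e.g. `mandelbrot 800 600 > out.ppm`.  Passing any extra third argument
// selects the scalar `mandelbrot_naive` path instead of the
// four-pixels-per-iteration SIMD path.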
@@ -0,0 +1,280 @@
extern crate simd;
use simd::f32x4;

fn mul(x: &[f32x4; 4], y: &[f32x4; 4]) -> [f32x4; 4] {
    let y0 = y[0];
    let y1 = y[1];
    let y2 = y[2];
    let y3 = y[3];
    [f32x4::splat(y0.extract(0)) * x[0] +
     f32x4::splat(y0.extract(1)) * x[1] +
     f32x4::splat(y0.extract(2)) * x[2] +
     f32x4::splat(y0.extract(3)) * x[3],
     f32x4::splat(y1.extract(0)) * x[0] +
     f32x4::splat(y1.extract(1)) * x[1] +
     f32x4::splat(y1.extract(2)) * x[2] +
     f32x4::splat(y1.extract(3)) * x[3],
     f32x4::splat(y2.extract(0)) * x[0] +
     f32x4::splat(y2.extract(1)) * x[1] +
     f32x4::splat(y2.extract(2)) * x[2] +
     f32x4::splat(y2.extract(3)) * x[3],
     f32x4::splat(y3.extract(0)) * x[0] +
     f32x4::splat(y3.extract(1)) * x[1] +
     f32x4::splat(y3.extract(2)) * x[2] +
     f32x4::splat(y3.extract(3)) * x[3],
    ]
}

fn inverse_naive(x: &[[f32; 4]; 4]) -> [[f32; 4]; 4] {
    let mut t = [[0_f32; 4]; 4];
    for i in 0..4 {
        t[0][i] = x[i][0];
        t[1][i] = x[i][1];
        t[2][i] = x[i][2];
        t[3][i] = x[i][3];
    }
    println!("{:?}", t);

    let _0 = t[2][2] * t[3][3];
    let _1 = t[2][3] * t[3][2];
    let _2 = t[2][1] * t[3][3];
    let _3 = t[2][3] * t[3][1];
    let _4 = t[2][1] * t[3][2];
    let _5 = t[2][2] * t[3][1];
    let _6 = t[2][0] * t[3][3];
    let _7 = t[2][3] * t[3][0];
    let _8 = t[2][0] * t[3][2];
    let _9 = t[2][2] * t[3][0];
    let _10 = t[2][0] * t[3][1];
    let _11 = t[2][1] * t[3][0];
    let v = [_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11];
    println!("{:?}", v);

    let d00 = _0 * t[1][1] + _3 * t[1][2] + _4 * t[1][3] -
        (_1 * t[1][1] + _2 * t[1][2] + _5 * t[1][3]);
    let d01 = _1 * t[1][0] + _6 * t[1][2] + _9 * t[1][3] -
        (_0 * t[1][0] + _7 * t[1][2] + _8 * t[1][3]);
    let d02 = _2 * t[1][0] + _7 * t[1][1] + _10 * t[1][3] -
        (_3 * t[1][0] + _6 * t[1][1] + _11 * t[1][3]);
    let d03 = _5 * t[1][0] + _8 * t[1][1] + _11 * t[1][2] -
        (_4 * t[1][0] + _9 * t[1][1] + _10 * t[1][2]);
    let d10 = _1 * t[0][1] + _2 * t[0][2] + _5 * t[0][3] -
        (_0 * t[0][1] + _3 * t[0][2] + _4 * t[0][3]);
    let d11 = _0 * t[0][0] + _7 * t[0][2] + _8 * t[0][3] -
        (_1 * t[0][0] + _6 * t[0][2] + _9 * t[0][3]);
    let d12 = _3 * t[0][0] + _6 * t[0][1] + _11 * t[0][3] -
        (_2 * t[0][0] + _7 * t[0][1] + _10 * t[0][3]);
    let d13 = _4 * t[0][0] + _9 * t[0][1] + _10 * t[0][2] -
        (_5 * t[0][0] + _8 * t[0][1] + _11 * t[0][2]);

    println!("{:?}", [d00, d01, d02, d03, d10, d11, d12, d13]);

    let _0 = t[0][2] * t[1][3];
    let _1 = t[0][3] * t[1][2];
    let _2 = t[0][1] * t[1][3];
    let _3 = t[0][3] * t[1][1];
    let _4 = t[0][1] * t[1][2];
    let _5 = t[0][2] * t[1][1];
    let _6 = t[0][0] * t[1][3];
    let _7 = t[0][3] * t[1][0];
    let _8 = t[0][0] * t[1][2];
    let _9 = t[0][2] * t[1][0];
    let _10 = t[0][0] * t[1][1];
    let _11 = t[0][1] * t[1][0];
    let v = [_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11];
    println!("{:?}", v);

    let d20 = _0 * t[3][1] + _3 * t[3][2] + _4 * t[3][3] -
        (_1 * t[3][1] + _2 * t[3][2] + _5 * t[3][3]);
    let d21 = _1 * t[3][0] + _6 * t[3][2] + _9 * t[3][3] -
        (_0 * t[3][0] + _7 * t[3][2] + _8 * t[3][3]);
    let d22 = _2 * t[3][0] + _7 * t[3][1] + _10 * t[3][3] -
        (_3 * t[3][0] + _6 * t[3][1] + _11 * t[3][3]);
    let d23 = _5 * t[3][0] + _8 * t[3][1] + _11 * t[3][2] -
        (_4 * t[3][0] + _9 * t[3][1] + _10 * t[3][2]);
    let d30 = _2 * t[2][2] + _5 * t[2][3] + _1 * t[2][1] -
        (_4 * t[2][3] + _0 * t[2][1] + _3 * t[2][2]);
    let d31 = _8 * t[2][3] + _0 * t[2][0] + _7 * t[2][2] -
        (_6 * t[2][2] + _9 * t[2][3] + _1 * t[2][0]);
    let d32 = _6 * t[2][1] + _11 * t[2][3] + _3 * t[2][0] -
        (_10 * t[2][3] + _2 * t[2][0] + _7 * t[2][1]);
    let d33 = _10 * t[2][2] + _4 * t[2][0] + _9 * t[2][1] -
        (_8 * t[2][1] + _11 * t[2][2] + _5 * t[2][0]);

    println!("{:?}", [d20, d21, d22, d23, d30, d31, d32, d33]);

    let det = t[0][0] * d00 + t[0][1] * d01 + t[0][2] * d02 + t[0][3] * d03;

    let det = 1.0 / det;
    let mut ret = [[d00, d01, d02, d03],
                   [d10, d11, d12, d13],
                   [d20, d21, d22, d23],
                   [d30, d31, d32, d33]];
    for i in 0..4 {
        for j in 0..4 {
            ret[i][j] *= det;
        }
    }
    ret
}
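
// Note (ours): `inverse_naive` is a straight Cramer's-rule expansion:
// transpose, form the twelve 2x2 sub-determinants, combine them into
// cofactors d00..d33, then scale by 1/det.  `inverse_simd4` below appears
// to follow the well-known SSE 4x4-inverse sequence (shuffle-heavy
// cofactor computation plus a Newton-Raphson-refined reciprocal),
// expressed with portable `extract`/`new` shuffles; the odd-looking
// `extract(5 - 4)` style indices mirror the 0..7 lane numbering of the
// original two-register shuffles.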

fn inverse_simd4(x: &[f32x4; 4]) -> [f32x4; 4] {
    let src0 = x[0];
    let src1 = x[1];
    let src2 = x[2];
    let src3 = x[3];

    let tmp1 = f32x4::new(src0.extract(0), src0.extract(1),
                          src1.extract(4 - 4), src1.extract(5 - 4));
    let row1 = f32x4::new(src2.extract(0), src2.extract(1),
                          src3.extract(4 - 4), src3.extract(5 - 4));
    let row0 = f32x4::new(tmp1.extract(0), tmp1.extract(2),
                          row1.extract(4 - 4), row1.extract(6 - 4));
    let row1 = f32x4::new(row1.extract(1), row1.extract(3),
                          tmp1.extract(5 - 4), tmp1.extract(7 - 4));

    let tmp1 = f32x4::new(src0.extract(2), src0.extract(3),
                          src1.extract(6 - 4), src1.extract(7 - 4));
    let row3 = f32x4::new(src2.extract(2), src2.extract(3),
                          src3.extract(6 - 4), src3.extract(7 - 4));
    let row2 = f32x4::new(tmp1.extract(0), tmp1.extract(2),
                          row3.extract(4 - 4), row3.extract(6 - 4));
    let row3 = f32x4::new(row3.extract(1), row3.extract(3),
                          tmp1.extract(5 - 4), tmp1.extract(7 - 4));

    let tmp1 = row2 * row3;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor0 = row1 * tmp1;
    let minor1 = row0 * tmp1;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor0 = (row1 * tmp1) - minor0;
    let minor1 = (row0 * tmp1) - minor1;
    let minor1 = f32x4::new(minor1.extract(2), minor1.extract(3),
                            minor1.extract(0), minor1.extract(1));
    //println!("{:?}", minor1);

    let tmp1 = row1 * row2;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor0 = (row3 * tmp1) + minor0;
    let minor3 = row0 * tmp1;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));

    let minor0 = minor0 - row3 * tmp1;
    let minor3 = row0 * tmp1 - minor3;
    let minor3 = f32x4::new(minor3.extract(2), minor3.extract(3),
                            minor3.extract(0), minor3.extract(1));
    //println!("{:?}", minor1);

    let tmp1 = row3 * f32x4::new(row1.extract(2), row1.extract(3),
                                 row1.extract(0), row1.extract(1));
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let row2 = f32x4::new(row2.extract(2), row2.extract(3),
                          row2.extract(0), row2.extract(1));
    let minor0 = row2 * tmp1 + minor0;
    let minor2 = row0 * tmp1;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor0 = minor0 - row2 * tmp1;
    let minor2 = row0 * tmp1 - minor2;
    let minor2 = f32x4::new(minor2.extract(2), minor2.extract(3),
                            minor2.extract(0), minor2.extract(1));
    //println!("{:?}", minor1);

    let tmp1 = row0 * row1;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor2 = minor2 + row3 * tmp1;
    let minor3 = row2 * tmp1 - minor3;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor2 = row3 * tmp1 - minor2;
    let minor3 = minor3 - row2 * tmp1;
    //println!("{:?}", minor1);

    let tmp1 = row0 * row3;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor1 = minor1 - row2 * tmp1;
    let minor2 = row1 * tmp1 + minor2;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor1 = row2 * tmp1 + minor1;
    let minor2 = minor2 - row1 * tmp1;
    //println!("{:?}", minor1);

    let tmp1 = row0 * row2;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor1 = row3 * tmp1 + minor1;
    let minor3 = minor3 - row1 * tmp1;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor1 = minor1 - row3 * tmp1;
    let minor3 = row1 * tmp1 + minor3;
    //println!("{:?}", minor1);

    let det = row0 * minor0;
    let det = f32x4::new(det.extract(2), det.extract(3),
                         det.extract(0), det.extract(1)) + det;
    let det = f32x4::new(det.extract(1), det.extract(0),
                         det.extract(3), det.extract(2)) + det;
    let tmp1 = det.approx_reciprocal();
    let det = tmp1 + tmp1 - det * tmp1 * tmp1;

    // let det = f32x4::splat(det.extract(0));

    [minor0 * det, minor1 * det, minor2 * det, minor3 * det]
}

fn p(x: &[f32x4; 4]) {
    for xx in x {
        for i in 0..4 {
            let v = xx.extract(i);
            if v == 0.0 {
                print!("{}{:6.2}", if i > 0 {", "} else {"|"}, "");
            } else {
                print!("{}{:6.2}", if i > 0 {", "} else {"|"}, xx.extract(i));
            }
        }
        println!(" |");
    }
}

fn main() {
    let x = [f32x4::new(-100.0, 6.0, 100.0, 1.0),
             f32x4::new(3.0, 1.0, 0.0, 1.0),
             f32x4::new(2.0, 1.0, 1.0, 1.0),
             f32x4::new(-10.0, 1.0, 1.0, 1.0)];

    /* let mut x_ = [[0.0; 4]; 4];
    for i in 0..4 {
        for j in 0..4 {
            x_[i][j] = x[i].extract(j as u32)
        }
    }

    let ret = inverse_naive(&x_);
    let mut y = [f32x4::splat(0.0); 4];
    for i in 0..4 {
        for j in 0..4 {
            y[i] = y[i].replace(j as u32, ret[i][j])
        }
    }*/
    let y = inverse_simd4(&x);
    p(&x);
    println!("");
    p(&y);
    println!("");
    p(&mul(&x, &y))
}
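
// Illustrative check (ours): `mul(&x, &y)` should be close to the
// identity when `y` is a good inverse of `x`.  `approx_reciprocal` makes
// the result inexact, so compare against a loose epsilon rather than
// exactly.
#[allow(dead_code)]
fn is_identity(m: &[f32x4; 4], eps: f32) -> bool {
    for i in 0..4 {
        for j in 0..4u32 {
            let expected = if i as u32 == j { 1.0 } else { 0.0 };
            if (m[i].extract(j) - expected).abs() >= eps { return false; }
        }
    }
    true
}
// e.g.: assert!(is_identity(&mul(&x, &y), 1e-2));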
@@ -0,0 +1,156 @@
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// contributed by the Rust Project Developers
// contributed by TeXitoi

const PI: f64 = 3.141592653589793;
const SOLAR_MASS: f64 = 4.0 * PI * PI;
const YEAR: f64 = 365.24;
const N_BODIES: usize = 5;

static BODIES: [Planet; N_BODIES] = [
    // Sun
    Planet {
        x: 0.0, y: 0.0, z: 0.0,
        vx: 0.0, vy: 0.0, vz: 0.0,
        mass: SOLAR_MASS,
    },
    // Jupiter
    Planet {
        x: 4.84143144246472090e+00,
        y: -1.16032004402742839e+00,
        z: -1.03622044471123109e-01,
        vx: 1.66007664274403694e-03 * YEAR,
        vy: 7.69901118419740425e-03 * YEAR,
        vz: -6.90460016972063023e-05 * YEAR,
        mass: 9.54791938424326609e-04 * SOLAR_MASS,
    },
    // Saturn
    Planet {
        x: 8.34336671824457987e+00,
        y: 4.12479856412430479e+00,
        z: -4.03523417114321381e-01,
        vx: -2.76742510726862411e-03 * YEAR,
        vy: 4.99852801234917238e-03 * YEAR,
        vz: 2.30417297573763929e-05 * YEAR,
        mass: 2.85885980666130812e-04 * SOLAR_MASS,
    },
    // Uranus
    Planet {
        x: 1.28943695621391310e+01,
        y: -1.51111514016986312e+01,
        z: -2.23307578892655734e-01,
        vx: 2.96460137564761618e-03 * YEAR,
        vy: 2.37847173959480950e-03 * YEAR,
        vz: -2.96589568540237556e-05 * YEAR,
        mass: 4.36624404335156298e-05 * SOLAR_MASS,
    },
    // Neptune
    Planet {
        x: 1.53796971148509165e+01,
        y: -2.59193146099879641e+01,
        z: 1.79258772950371181e-01,
        vx: 2.68067772490389322e-03 * YEAR,
        vy: 1.62824170038242295e-03 * YEAR,
        vz: -9.51592254519715870e-05 * YEAR,
        mass: 5.15138902046611451e-05 * SOLAR_MASS,
    },
];

#[derive(Clone, Copy)]
struct Planet {
    x: f64, y: f64, z: f64,
    vx: f64, vy: f64, vz: f64,
    mass: f64,
}

fn advance(bodies: &mut [Planet; N_BODIES], dt: f64, steps: i32) {
    for _ in 0..steps {
        let mut b_slice: &mut [_] = bodies;
        loop {
            let bi = match shift_mut_ref(&mut b_slice) {
                Some(bi) => bi,
                None => break
            };
            for bj in b_slice.iter_mut() {
                let dx = bi.x - bj.x;
                let dy = bi.y - bj.y;
                let dz = bi.z - bj.z;

                let d2 = dx * dx + dy * dy + dz * dz;
                let mag = dt / (d2 * d2.sqrt());

                let massj_mag = bj.mass * mag;
                bi.vx -= dx * massj_mag;
                bi.vy -= dy * massj_mag;
                bi.vz -= dz * massj_mag;

                let massi_mag = bi.mass * mag;
                bj.vx += dx * massi_mag;
                bj.vy += dy * massi_mag;
                bj.vz += dz * massi_mag;
            }
            bi.x += dt * bi.vx;
            bi.y += dt * bi.vy;
            bi.z += dt * bi.vz;
        }
    }
}

fn energy(bodies: &[Planet; N_BODIES]) -> f64 {
    let mut e = 0.0;
    let mut bodies = bodies.iter();
    loop {
        let bi = match bodies.next() {
            Some(bi) => bi,
            None => break
        };
        e += (bi.vx * bi.vx + bi.vy * bi.vy + bi.vz * bi.vz) * bi.mass / 2.0;
        for bj in bodies.clone() {
            let dx = bi.x - bj.x;
            let dy = bi.y - bj.y;
            let dz = bi.z - bj.z;
            let dist = (dx * dx + dy * dy + dz * dz).sqrt();
            e -= bi.mass * bj.mass / dist;
        }
    }
    e
}

fn offset_momentum(bodies: &mut [Planet; N_BODIES]) {
    let mut px = 0.0;
    let mut py = 0.0;
    let mut pz = 0.0;
    for bi in bodies.iter() {
        px += bi.vx * bi.mass;
        py += bi.vy * bi.mass;
        pz += bi.vz * bi.mass;
    }
    let sun = &mut bodies[0];
    sun.vx = - px / SOLAR_MASS;
    sun.vy = - py / SOLAR_MASS;
    sun.vz = - pz / SOLAR_MASS;
}

fn main() {
    let n = std::env::args().nth(1).expect("need one arg").parse().unwrap();
    let mut bodies = BODIES;

    offset_momentum(&mut bodies);
    println!("{:.9}", energy(&bodies));

    advance(&mut bodies, 0.01, n);

    println!("{:.9}", energy(&bodies));
}

/// Pop a mutable reference off the head of a slice, mutating the slice to no
/// longer contain the mutable reference.
fn shift_mut_ref<'a, T>(r: &mut &'a mut [T]) -> Option<&'a mut T> {
    if r.len() == 0 { return None }
    let tmp = std::mem::replace(r, &mut []);
    let (h, t) = tmp.split_at_mut(1);
    *r = t;
    Some(&mut h[0])
}
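
// Minimal sketch (ours) of the iteration pattern `advance` builds on top
// of `shift_mut_ref`: peeling the head off the slice yields every
// unordered pair exactly once while keeping `&mut` access to both
// elements, with no unsafe code and no index arithmetic.
#[allow(dead_code)]
fn for_each_pair<T, F: FnMut(&mut T, &mut T)>(mut s: &mut [T], mut f: F) {
    while let Some(head) = shift_mut_ref(&mut s) {
        for other in s.iter_mut() {
            f(head, other);
        }
    }
}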
@@ -0,0 +1,170 @@
#![feature(cfg_target_feature)]

extern crate simd;

#[cfg(target_feature = "sse2")]
use simd::x86::sse2::*;
#[cfg(target_arch = "aarch64")]
use simd::aarch64::neon::*;

const PI: f64 = 3.141592653589793;
const SOLAR_MASS: f64 = 4.0 * PI * PI;
const DAYS_PER_YEAR: f64 = 365.24;

struct Body {
    x: [f64; 3],
    _fill: f64,
    v: [f64; 3],
    mass: f64,
}

impl Body {
    fn new(x0: f64, x1: f64, x2: f64,
           v0: f64, v1: f64, v2: f64,
           mass: f64) -> Body {
        Body {
            x: [x0, x1, x2],
            _fill: 0.0,
            v: [v0, v1, v2],
            mass: mass,
        }
    }
}

const N_BODIES: usize = 5;
const N: usize = N_BODIES * (N_BODIES - 1) / 2;
fn offset_momentum(bodies: &mut [Body; N_BODIES]) {
    let (sun, rest) = bodies.split_at_mut(1);
    let sun = &mut sun[0];
    for body in rest {
        for k in 0..3 {
            sun.v[k] -= body.v[k] * body.mass / SOLAR_MASS;
        }
    }
}
fn advance(bodies: &mut [Body; N_BODIES], dt: f64) {
    let mut r = [[0.0; 4]; N];
    let mut mag = [0.0; N];

    let mut dx = [f64x2::splat(0.0); 3];
    let mut dsquared;
    let mut distance;
    let mut dmag;

    let mut i = 0;
    for j in 0..N_BODIES {
        for k in j+1..N_BODIES {
            for m in 0..3 {
                r[i][m] = bodies[j].x[m] - bodies[k].x[m];
            }
            i += 1;
        }
    }

    i = 0;
    while i < N {
        for m in 0..3 {
            dx[m] = f64x2::new(r[i][m], r[i+1][m]);
        }

        dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
        distance = dsquared.to_f32().approx_rsqrt().to_f64();
        for _ in 0..2 {
            distance = distance * f64x2::splat(1.5) -
                ((f64x2::splat(0.5) * dsquared) * distance) * (distance * distance)
        }
        dmag = f64x2::splat(dt) / dsquared * distance;
        dmag.store(&mut mag, i);

        i += 2;
    }

    i = 0;
    for j in 0..N_BODIES {
        for k in j+1..N_BODIES {
            for m in 0..3 {
                bodies[j].v[m] -= r[i][m] * bodies[k].mass * mag[i];
                bodies[k].v[m] += r[i][m] * bodies[j].mass * mag[i];
            }
            i += 1
        }
    }
    for body in bodies {
        for m in 0..3 {
            body.x[m] += dt * body.v[m]
        }
    }
}
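
// Scalar model of the reciprocal-square-root refinement in `advance`
// above (our illustration, not part of the benchmark): starting from a
// rough estimate d0 ~ 1/sqrt(s), each Newton-Raphson step
//     d' = 1.5 * d - (0.5 * s * d) * d * d
// roughly doubles the number of correct bits, so two steps turn the
// f32-precision `approx_rsqrt` seed into a near-f64-accurate value.
#[allow(dead_code)]
fn refined_rsqrt(s: f64, mut d: f64) -> f64 {
    for _ in 0..2 {
        d = d * 1.5 - ((0.5 * s) * d) * (d * d);
    }
    d
}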

fn energy(bodies: &[Body; N_BODIES]) -> f64 {
    let mut e = 0.0;
    for i in 0..N_BODIES {
        let bi = &bodies[i];
        e += bi.mass * (bi.v[0] * bi.v[0] + bi.v[1] * bi.v[1] + bi.v[2] * bi.v[2]) / 2.0;
        for j in i+1..N_BODIES {
            let bj = &bodies[j];
            let mut dx = [0.0; 3];
            for k in 0..3 {
                dx[k] = bi.x[k] - bj.x[k];
            }
            let mut distance = 0.0;
            for &d in &dx { distance += d * d }
            e -= bi.mass * bj.mass / distance.sqrt()
        }
    }
    e
}

fn main() {
    let mut bodies: [Body; N_BODIES] = [
        /* sun */
        Body::new(0.0, 0.0, 0.0,
                  0.0, 0.0, 0.0,
                  SOLAR_MASS),
        /* jupiter */
        Body::new(4.84143144246472090e+00,
                  -1.16032004402742839e+00,
                  -1.03622044471123109e-01,
                  1.66007664274403694e-03 * DAYS_PER_YEAR,
                  7.69901118419740425e-03 * DAYS_PER_YEAR,
                  -6.90460016972063023e-05 * DAYS_PER_YEAR,
                  9.54791938424326609e-04 * SOLAR_MASS),
        /* saturn */
        Body::new(8.34336671824457987e+00,
                  4.12479856412430479e+00,
                  -4.03523417114321381e-01,
                  -2.76742510726862411e-03 * DAYS_PER_YEAR,
                  4.99852801234917238e-03 * DAYS_PER_YEAR,
                  2.30417297573763929e-05 * DAYS_PER_YEAR,
                  2.85885980666130812e-04 * SOLAR_MASS),
        /* uranus */
        Body::new(1.28943695621391310e+01,
                  -1.51111514016986312e+01,
                  -2.23307578892655734e-01,
                  2.96460137564761618e-03 * DAYS_PER_YEAR,
                  2.37847173959480950e-03 * DAYS_PER_YEAR,
                  -2.96589568540237556e-05 * DAYS_PER_YEAR,
                  4.36624404335156298e-05 * SOLAR_MASS),
        /* neptune */
        Body::new(1.53796971148509165e+01,
                  -2.59193146099879641e+01,
                  1.79258772950371181e-01,
                  2.68067772490389322e-03 * DAYS_PER_YEAR,
                  1.62824170038242295e-03 * DAYS_PER_YEAR,
                  -9.51592254519715870e-05 * DAYS_PER_YEAR,
                  5.15138902046611451e-05 * SOLAR_MASS)
    ];

    let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap();

    offset_momentum(&mut bodies);
    println!("{:.9}", energy(&bodies));
    for _ in 0..n {
        advance(&mut bodies, 0.01);
    }
    println!("{:.9}", energy(&bodies));
}
@@ -0,0 +1,9 @@
extern crate simd;

use simd::*;

fn main() {
    let x = i32x4::splat(1_i32);
    let y = -x;
    let z = !x;
}
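
// Note (ours): this example appears to exist purely to check that unary
// negation and bitwise-not compile for the packed integer types; `y` and
// `z` are never read, so rustc will warn about unused variables here.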
@@ -0,0 +1,106 @@
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// contributed by the Rust Project Developers
// contributed by TeXitoi

#![allow(non_snake_case)]

use std::iter::repeat;
//use std::thread;

// As std::simd::f64x2 is unstable, we provide a similar interface,
// expecting llvm to autovectorize its usage.
#[allow(non_camel_case_types)]
struct f64x2(f64, f64);
impl std::ops::Add for f64x2 {
    type Output = Self;
    fn add(self, rhs: Self) -> Self {
        f64x2(self.0 + rhs.0, self.1 + rhs.1)
    }
}
impl std::ops::Div for f64x2 {
    type Output = Self;
    fn div(self, rhs: Self) -> Self {
        f64x2(self.0 / rhs.0, self.1 / rhs.1)
    }
}

fn main() {
    let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap();
    let answer = spectralnorm(n);
    println!("{:.9}", answer);
}

fn spectralnorm(n: usize) -> f64 {
    assert!(n % 2 == 0, "only even lengths are accepted");
    let mut u = repeat(1.0).take(n).collect::<Vec<_>>();
    let mut v = u.clone();
    let mut tmp = v.clone();
    for _ in 0..10 {
        mult_AtAv(&u, &mut v, &mut tmp);
        mult_AtAv(&v, &mut u, &mut tmp);
    }
    (dot(&u, &v) / dot(&v, &v)).sqrt()
}

fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) {
    mult_Av(v, tmp);
    mult_Atv(tmp, out);
}

fn mult_Av(v: &[f64], out: &mut [f64]) {
    parallel(out, |start, out| mult(v, out, start, |i, j| A(i, j)));
}

fn mult_Atv(v: &[f64], out: &mut [f64]) {
    parallel(out, |start, out| mult(v, out, start, |i, j| A(j, i)));
}

fn mult<F>(v: &[f64], out: &mut [f64], start: usize, a: F)
           where F: Fn(usize, usize) -> f64 {
    for (i, slot) in out.iter_mut().enumerate().map(|(i, s)| (i + start, s)) {
        let mut sum = f64x2(0.0, 0.0);
        for (j, chunk) in v.chunks(2).enumerate().map(|(j, s)| (2 * j, s)) {
            let top = f64x2(chunk[0], chunk[1]);
            let bot = f64x2(a(i, j), a(i, j + 1));
            sum = sum + top / bot;
        }
        let f64x2(a, b) = sum;
        *slot = a + b;
    }
}

fn A(i: usize, j: usize) -> f64 {
    ((i + j) * (i + j + 1) / 2 + i + 1) as f64
}
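
// Quick sanity check (ours): `A` returns the *denominator* of the
// spectral-norm matrix entry a(i, j) = 1 / ((i+j)(i+j+1)/2 + i + 1),
// so row 0 of the matrix starts 1, 1/2, 1/4, ... and column 0 starts
// 1, 1/3, 1/6, ...
#[test]
fn A_matches_first_entries() {
    assert_eq!(A(0, 0), 1.0);
    assert_eq!(A(0, 1), 2.0);
    assert_eq!(A(1, 0), 3.0);
    assert_eq!(A(1, 1), 5.0);
}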

fn dot(v: &[f64], u: &[f64]) -> f64 {
    v.iter().zip(u.iter()).map(|(a, b)| *a * *b).fold(0., |acc, i| acc + i)
}

//struct Racy<T>(T);
//unsafe impl<T: 'static> Send for Racy<T> {}

// Executes a closure over the given mutable slice; the closure `f` is
// given the starting index within `v` as well as a sub-slice of `v`.
// (The threaded version below is commented out, so this currently runs
// the closure once, serially, over the whole slice.)
fn parallel<'a, T, F>(v: &mut [T], ref f: F)
    where T: 'static + Send + Sync,
          F: Fn(usize, &mut [T]) + Sync
{
    f(0, v);
    /*let size = v.len() / 4 + 1;
    let jhs = v.chunks_mut(size).enumerate().map(|(i, chunk)| {
        // Need to convert `f` and `chunk` to something that can cross the task
        // boundary.
        let f = Racy(f as *const F as *const usize);
        let raw = Racy((&mut chunk[0] as *mut T, chunk.len()));
        thread::spawn(move|| {
            let f = f.0 as *const F;
            let raw = raw.0;
            unsafe { (*f)(i * size, std::slice::from_raw_parts_mut(raw.0, raw.1)) }
        })
    }).collect::<Vec<_>>();
    for jh in jhs { jh.join().unwrap(); }*/
}
@@ -0,0 +1,74 @@
#![feature(cfg_target_feature)]
#![allow(non_snake_case)]

extern crate simd;

#[cfg(target_feature = "sse2")]
use simd::x86::sse2::f64x2;
#[cfg(target_arch = "aarch64")]
use simd::aarch64::neon::f64x2;

fn A(i: usize, j: usize) -> f64 {
    ((i + j) * (i + j + 1) / 2 + i + 1) as f64
}

fn dot(x: &[f64], y: &[f64]) -> f64 {
    x.iter().zip(y).map(|(&x, &y)| x * y).fold(0.0, |a, b| a + b)
}

fn mult_Av(v: &[f64], out: &mut [f64]) {
    assert!(v.len() == out.len());
    assert!(v.len() % 2 == 0);

    for i in 0..v.len() {
        let mut sum = f64x2::splat(0.0);

        let mut j = 0;
        while j < v.len() {
            let b = f64x2::load(v, j);
            let a = f64x2::new(A(i, j), A(i, j + 1));
            sum = sum + b / a;
            j += 2
        }
        out[i] = sum.extract(0) + sum.extract(1);
    }
}

fn mult_Atv(v: &[f64], out: &mut [f64]) {
    assert!(v.len() == out.len());
    assert!(v.len() % 2 == 0);

    for i in 0..v.len() {
        let mut sum = f64x2::splat(0.0);

        let mut j = 0;
        while j < v.len() {
            let b = f64x2::load(v, j);
            let a = f64x2::new(A(j, i), A(j + 1, i));
            sum = sum + b / a;
            j += 2
        }
        out[i] = sum.extract(0) + sum.extract(1);
    }
}

fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) {
    mult_Av(v, tmp);
    mult_Atv(tmp, out);
}

fn main() {
    let mut n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap();
    // Round n up to even so the two-lane f64x2 loads above never read
    // past the end of the vectors.
    if n % 2 == 1 { n += 1 }

    let mut u = vec![1.0; n];
    let mut v = u.clone();
    let mut tmp = u.clone();

    for _ in 0..10 {
        mult_AtAv(&u, &mut v, &mut tmp);
        mult_AtAv(&v, &mut u, &mut tmp);
    }

    println!("{:.9}", (dot(&u, &v) / dot(&v, &v)).sqrt());
}
@@ -0,0 +1,3 @@
//! Features specific to AArch64 CPUs.

pub mod neon;
@@ -0,0 +1,681 @@
use super::super::*;
|
||||
use {simd_cast, f32x2};
|
||||
|
||||
pub use sixty_four::{f64x2, i64x2, u64x2, bool64ix2, bool64fx2};
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct u32x2(u32, u32);
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct i32x2(i32, i32);
|
||||
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct u16x4(u16, u16, u16, u16);
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct i16x4(i16, i16, i16, i16);
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct u8x8(u8, u8, u8, u8,
|
||||
u8, u8, u8, u8);
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct i8x8(i8, i8, i8, i8,
|
||||
i8, i8, i8, i8);
|
||||
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct i64x1(i64);
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct u64x1(u64);
|
||||
#[repr(simd)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct f64x1(f64);
|
||||
|
||||
#[allow(dead_code)]
|
||||
extern "platform-intrinsic" {
|
||||
fn aarch64_vhadd_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vhadd_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vhadd_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vhadd_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vhadd_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vhadd_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vhaddq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vhaddq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vhaddq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vhaddq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vhaddq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vhaddq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vrhadd_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vrhadd_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vrhadd_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vrhadd_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vrhadd_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vrhadd_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vrhaddq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vrhaddq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vrhaddq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vrhaddq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vrhaddq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vrhaddq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vqadd_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vqadd_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vqadd_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vqadd_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vqadd_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vqadd_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vqadd_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vqadd_u64(x: u64x1, y: u64x1) -> u64x1;
|
||||
fn aarch64_vqaddq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vqaddq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vqaddq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vqaddq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vqaddq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vqaddq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vqaddq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vqaddq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vuqadd_s8(x: i8x16, y: u8x16) -> i8x16;
|
||||
fn aarch64_vuqadd_s16(x: i16x8, y: u16x8) -> i16x8;
|
||||
fn aarch64_vuqadd_s32(x: i32x4, y: u32x4) -> i32x4;
|
||||
fn aarch64_vuqadd_s64(x: i64x2, y: u64x2) -> i64x2;
|
||||
fn aarch64_vsqadd_u8(x: u8x16, y: i8x16) -> u8x16;
|
||||
fn aarch64_vsqadd_u16(x: u16x8, y: i16x8) -> u16x8;
|
||||
fn aarch64_vsqadd_u32(x: u32x4, y: i32x4) -> u32x4;
|
||||
fn aarch64_vsqadd_u64(x: u64x2, y: i64x2) -> u64x2;
|
||||
fn aarch64_vraddhn_s16(x: i16x8, y: i16x8) -> i8x8;
|
||||
fn aarch64_vraddhn_u16(x: u16x8, y: u16x8) -> u8x8;
|
||||
fn aarch64_vraddhn_s32(x: i32x4, y: i32x4) -> i16x4;
|
||||
fn aarch64_vraddhn_u32(x: u32x4, y: u32x4) -> u16x4;
|
||||
fn aarch64_vraddhn_s64(x: i64x2, y: i64x2) -> i32x2;
|
||||
fn aarch64_vraddhn_u64(x: u64x2, y: u64x2) -> u32x2;
|
||||
fn aarch64_vfmulx_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vfmulx_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vfmulxq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vfmulxq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vfma_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vfma_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vfmaq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vfmaq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vqdmulh_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vqdmulh_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vqdmulhq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vqdmulhq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vqrdmulh_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vqrdmulh_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vqrdmulhq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vqrdmulhq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vmull_s8(x: i8x8, y: i8x8) -> i16x8;
|
||||
fn aarch64_vmull_u8(x: u8x8, y: u8x8) -> u16x8;
|
||||
fn aarch64_vmull_s16(x: i16x4, y: i16x4) -> i32x4;
|
||||
fn aarch64_vmull_u16(x: u16x4, y: u16x4) -> u32x4;
|
||||
fn aarch64_vmull_s32(x: i32x2, y: i32x2) -> i64x2;
|
||||
fn aarch64_vmull_u32(x: u32x2, y: u32x2) -> u64x2;
|
||||
fn aarch64_vqdmullq_s8(x: i8x8, y: i8x8) -> i16x8;
|
||||
fn aarch64_vqdmullq_s16(x: i16x4, y: i16x4) -> i32x4;
|
||||
fn aarch64_vhsub_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vhsub_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vhsub_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vhsub_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vhsub_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vhsub_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vhsubq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vhsubq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vhsubq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vhsubq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vhsubq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vhsubq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vqsub_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vqsub_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vqsub_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vqsub_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vqsub_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vqsub_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vqsub_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vqsub_u64(x: u64x1, y: u64x1) -> u64x1;
|
||||
fn aarch64_vqsubq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vqsubq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vqsubq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vqsubq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vqsubq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vqsubq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vqsubq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vqsubq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vrsubhn_s16(x: i16x8, y: i16x8) -> i8x8;
|
||||
fn aarch64_vrsubhn_u16(x: u16x8, y: u16x8) -> u8x8;
|
||||
fn aarch64_vrsubhn_s32(x: i32x4, y: i32x4) -> i16x4;
|
||||
fn aarch64_vrsubhn_u32(x: u32x4, y: u32x4) -> u16x4;
|
||||
fn aarch64_vrsubhn_s64(x: i64x2, y: i64x2) -> i32x2;
|
||||
fn aarch64_vrsubhn_u64(x: u64x2, y: u64x2) -> u32x2;
|
||||
fn aarch64_vabd_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vabd_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vabd_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vabd_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vabd_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vabd_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vabd_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vabd_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vabdq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vabdq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vabdq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vabdq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vabdq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vabdq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vabdq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vabdq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vmax_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vmax_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vmax_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vmax_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vmax_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vmax_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vmax_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vmax_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vmaxq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vmaxq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vmaxq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vmaxq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vmaxq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vmaxq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vmaxq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vmaxq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vmin_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vmin_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vmin_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vmin_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vmin_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vmin_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vmin_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vmin_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vminq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vminq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vminq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vminq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vminq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vminq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vminq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vminq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vmaxnm_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vmaxnm_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vmaxnmq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vmaxnmq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vminnm_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vminnm_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vminnmq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vminnmq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vshl_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vshl_u8(x: u8x8, y: i8x8) -> u8x8;
|
||||
fn aarch64_vshl_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vshl_u16(x: u16x4, y: i16x4) -> u16x4;
|
||||
fn aarch64_vshl_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vshl_u32(x: u32x2, y: i32x2) -> u32x2;
|
||||
fn aarch64_vshl_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vshl_u64(x: u64x1, y: i64x1) -> u64x1;
|
||||
fn aarch64_vshlq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vshlq_u8(x: u8x16, y: i8x16) -> u8x16;
|
||||
fn aarch64_vshlq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vshlq_u16(x: u16x8, y: i16x8) -> u16x8;
|
||||
fn aarch64_vshlq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vshlq_u32(x: u32x4, y: i32x4) -> u32x4;
|
||||
fn aarch64_vshlq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vshlq_u64(x: u64x2, y: i64x2) -> u64x2;
|
||||
fn aarch64_vqshl_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vqshl_u8(x: u8x8, y: i8x8) -> u8x8;
|
||||
fn aarch64_vqshl_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vqshl_u16(x: u16x4, y: i16x4) -> u16x4;
|
||||
fn aarch64_vqshl_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vqshl_u32(x: u32x2, y: i32x2) -> u32x2;
|
||||
fn aarch64_vqshl_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vqshl_u64(x: u64x1, y: i64x1) -> u64x1;
|
||||
fn aarch64_vqshlq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vqshlq_u8(x: u8x16, y: i8x16) -> u8x16;
|
||||
fn aarch64_vqshlq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vqshlq_u16(x: u16x8, y: i16x8) -> u16x8;
|
||||
fn aarch64_vqshlq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vqshlq_u32(x: u32x4, y: i32x4) -> u32x4;
|
||||
fn aarch64_vqshlq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vqshlq_u64(x: u64x2, y: i64x2) -> u64x2;
|
||||
fn aarch64_vrshl_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vrshl_u8(x: u8x8, y: i8x8) -> u8x8;
|
||||
fn aarch64_vrshl_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vrshl_u16(x: u16x4, y: i16x4) -> u16x4;
|
||||
fn aarch64_vrshl_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vrshl_u32(x: u32x2, y: i32x2) -> u32x2;
|
||||
fn aarch64_vrshl_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vrshl_u64(x: u64x1, y: i64x1) -> u64x1;
|
||||
fn aarch64_vrshlq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vrshlq_u8(x: u8x16, y: i8x16) -> u8x16;
|
||||
fn aarch64_vrshlq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vrshlq_u16(x: u16x8, y: i16x8) -> u16x8;
|
||||
fn aarch64_vrshlq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vrshlq_u32(x: u32x4, y: i32x4) -> u32x4;
|
||||
fn aarch64_vrshlq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vrshlq_u64(x: u64x2, y: i64x2) -> u64x2;
|
||||
fn aarch64_vqrshl_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vqrshl_u8(x: u8x8, y: i8x8) -> u8x8;
|
||||
fn aarch64_vqrshl_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vqrshl_u16(x: u16x4, y: i16x4) -> u16x4;
|
||||
fn aarch64_vqrshl_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vqrshl_u32(x: u32x2, y: i32x2) -> u32x2;
|
||||
fn aarch64_vqrshl_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vqrshl_u64(x: u64x1, y: i64x1) -> u64x1;
|
||||
fn aarch64_vqrshlq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vqrshlq_u8(x: u8x16, y: i8x16) -> u8x16;
|
||||
fn aarch64_vqrshlq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vqrshlq_u16(x: u16x8, y: i16x8) -> u16x8;
|
||||
fn aarch64_vqrshlq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vqrshlq_u32(x: u32x4, y: i32x4) -> u32x4;
|
||||
fn aarch64_vqrshlq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vqrshlq_u64(x: u64x2, y: i64x2) -> u64x2;
|
||||
fn aarch64_vqshrun_n_s16(x: i16x8, y: u32) -> i8x8;
|
||||
fn aarch64_vqshrun_n_s32(x: i32x4, y: u32) -> i16x4;
|
||||
fn aarch64_vqshrun_n_s64(x: i64x2, y: u32) -> i32x2;
|
||||
fn aarch64_vqrshrun_n_s16(x: i16x8, y: u32) -> i8x8;
|
||||
fn aarch64_vqrshrun_n_s32(x: i32x4, y: u32) -> i16x4;
|
||||
fn aarch64_vqrshrun_n_s64(x: i64x2, y: u32) -> i32x2;
|
||||
fn aarch64_vqshrn_n_s16(x: i16x8, y: u32) -> i8x8;
|
||||
fn aarch64_vqshrn_n_u16(x: u16x8, y: u32) -> u8x8;
|
||||
fn aarch64_vqshrn_n_s32(x: i32x4, y: u32) -> i16x4;
|
||||
fn aarch64_vqshrn_n_u32(x: u32x4, y: u32) -> u16x4;
|
||||
fn aarch64_vqshrn_n_s64(x: i64x2, y: u32) -> i32x2;
|
||||
fn aarch64_vqshrn_n_u64(x: u64x2, y: u32) -> u32x2;
|
||||
fn aarch64_vrshrn_n_s16(x: i16x8, y: u32) -> i8x8;
|
||||
fn aarch64_vrshrn_n_u16(x: u16x8, y: u32) -> u8x8;
|
||||
fn aarch64_vrshrn_n_s32(x: i32x4, y: u32) -> i16x4;
|
||||
fn aarch64_vrshrn_n_u32(x: u32x4, y: u32) -> u16x4;
|
||||
fn aarch64_vrshrn_n_s64(x: i64x2, y: u32) -> i32x2;
|
||||
fn aarch64_vrshrn_n_u64(x: u64x2, y: u32) -> u32x2;
|
||||
fn aarch64_vqrshrn_n_s16(x: i16x8, y: u32) -> i8x8;
|
||||
fn aarch64_vqrshrn_n_u16(x: u16x8, y: u32) -> u8x8;
|
||||
fn aarch64_vqrshrn_n_s32(x: i32x4, y: u32) -> i16x4;
|
||||
fn aarch64_vqrshrn_n_u32(x: u32x4, y: u32) -> u16x4;
|
||||
fn aarch64_vqrshrn_n_s64(x: i64x2, y: u32) -> i32x2;
|
||||
fn aarch64_vqrshrn_n_u64(x: u64x2, y: u32) -> u32x2;
|
||||
fn aarch64_vsri_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vsri_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vsri_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vsri_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vsri_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vsri_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vsri_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vsri_u64(x: u64x1, y: u64x1) -> u64x1;
|
||||
fn aarch64_vsriq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vsriq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vsriq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vsriq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vsriq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vsriq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vsriq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vsriq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vsli_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vsli_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vsli_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vsli_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vsli_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vsli_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vsli_s64(x: i64x1, y: i64x1) -> i64x1;
|
||||
fn aarch64_vsli_u64(x: u64x1, y: u64x1) -> u64x1;
|
||||
fn aarch64_vsliq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vsliq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vsliq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vsliq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vsliq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vsliq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vsliq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vsliq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vvqmovn_s16(x: i16x8) -> i8x8;
|
||||
fn aarch64_vvqmovn_u16(x: u16x8) -> u8x8;
|
||||
fn aarch64_vvqmovn_s32(x: i32x4) -> i16x4;
|
||||
fn aarch64_vvqmovn_u32(x: u32x4) -> u16x4;
|
||||
fn aarch64_vvqmovn_s64(x: i64x2) -> i32x2;
|
||||
fn aarch64_vvqmovn_u64(x: u64x2) -> u32x2;
|
||||
fn aarch64_vabs_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vabs_s16(x: i16x4) -> i16x4;
|
||||
fn aarch64_vabs_s32(x: i32x2) -> i32x2;
|
||||
fn aarch64_vabs_s64(x: i64x1) -> i64x1;
|
||||
fn aarch64_vabsq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vabsq_s16(x: i16x8) -> i16x8;
|
||||
fn aarch64_vabsq_s32(x: i32x4) -> i32x4;
|
||||
fn aarch64_vabsq_s64(x: i64x2) -> i64x2;
|
||||
fn aarch64_vabs_f32(x: f32x2) -> f32x2;
|
||||
fn aarch64_vabs_f64(x: f64x1) -> f64x1;
|
||||
fn aarch64_vabsq_f32(x: f32x4) -> f32x4;
|
||||
fn aarch64_vabsq_f64(x: f64x2) -> f64x2;
|
||||
fn aarch64_vqabs_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vqabs_s16(x: i16x4) -> i16x4;
|
||||
fn aarch64_vqabs_s32(x: i32x2) -> i32x2;
|
||||
fn aarch64_vqabs_s64(x: i64x1) -> i64x1;
|
||||
fn aarch64_vqabsq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vqabsq_s16(x: i16x8) -> i16x8;
|
||||
fn aarch64_vqabsq_s32(x: i32x4) -> i32x4;
|
||||
fn aarch64_vqabsq_s64(x: i64x2) -> i64x2;
|
||||
fn aarch64_vqneg_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vqneg_s16(x: i16x4) -> i16x4;
|
||||
fn aarch64_vqneg_s32(x: i32x2) -> i32x2;
|
||||
fn aarch64_vqneg_s64(x: i64x1) -> i64x1;
|
||||
fn aarch64_vqnegq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vqnegq_s16(x: i16x8) -> i16x8;
|
||||
fn aarch64_vqnegq_s32(x: i32x4) -> i32x4;
|
||||
fn aarch64_vqnegq_s64(x: i64x2) -> i64x2;
|
||||
fn aarch64_vclz_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vclz_u8(x: u8x8) -> u8x8;
|
||||
fn aarch64_vclz_s16(x: i16x4) -> i16x4;
|
||||
fn aarch64_vclz_u16(x: u16x4) -> u16x4;
|
||||
fn aarch64_vclz_s32(x: i32x2) -> i32x2;
|
||||
fn aarch64_vclz_u32(x: u32x2) -> u32x2;
|
||||
fn aarch64_vclzq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vclzq_u8(x: u8x16) -> u8x16;
|
||||
fn aarch64_vclzq_s16(x: i16x8) -> i16x8;
|
||||
fn aarch64_vclzq_u16(x: u16x8) -> u16x8;
|
||||
fn aarch64_vclzq_s32(x: i32x4) -> i32x4;
|
||||
fn aarch64_vclzq_u32(x: u32x4) -> u32x4;
|
||||
fn aarch64_vcls_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vcls_u8(x: u8x8) -> u8x8;
|
||||
fn aarch64_vcls_s16(x: i16x4) -> i16x4;
|
||||
fn aarch64_vcls_u16(x: u16x4) -> u16x4;
|
||||
fn aarch64_vcls_s32(x: i32x2) -> i32x2;
|
||||
fn aarch64_vcls_u32(x: u32x2) -> u32x2;
|
||||
fn aarch64_vclsq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vclsq_u8(x: u8x16) -> u8x16;
|
||||
fn aarch64_vclsq_s16(x: i16x8) -> i16x8;
|
||||
fn aarch64_vclsq_u16(x: u16x8) -> u16x8;
|
||||
fn aarch64_vclsq_s32(x: i32x4) -> i32x4;
|
||||
fn aarch64_vclsq_u32(x: u32x4) -> u32x4;
|
||||
fn aarch64_vcnt_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vcnt_u8(x: u8x8) -> u8x8;
|
||||
fn aarch64_vcntq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vcntq_u8(x: u8x16) -> u8x16;
|
||||
fn aarch64_vrecpe_u32(x: u32x2) -> u32x2;
|
||||
fn aarch64_vrecpe_f32(x: f32x2) -> f32x2;
|
||||
fn aarch64_vrecpe_f64(x: f64x1) -> f64x1;
|
||||
fn aarch64_vrecpeq_u32(x: u32x4) -> u32x4;
|
||||
fn aarch64_vrecpeq_f32(x: f32x4) -> f32x4;
|
||||
fn aarch64_vrecpeq_f64(x: f64x2) -> f64x2;
|
||||
fn aarch64_vrecps_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vrecps_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vrecpsq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vrecpsq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vsqrt_f32(x: f32x2) -> f32x2;
|
||||
fn aarch64_vsqrt_f64(x: f64x1) -> f64x1;
|
||||
fn aarch64_vsqrtq_f32(x: f32x4) -> f32x4;
|
||||
fn aarch64_vsqrtq_f64(x: f64x2) -> f64x2;
|
||||
fn aarch64_vrsqrte_u32(x: u32x2) -> u32x2;
|
||||
fn aarch64_vrsqrte_f32(x: f32x2) -> f32x2;
|
||||
fn aarch64_vrsqrte_f64(x: f64x1) -> f64x1;
|
||||
fn aarch64_vrsqrteq_u32(x: u32x4) -> u32x4;
|
||||
fn aarch64_vrsqrteq_f32(x: f32x4) -> f32x4;
|
||||
fn aarch64_vrsqrteq_f64(x: f64x2) -> f64x2;
|
||||
fn aarch64_vrsqrts_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vrsqrts_f64(x: f64x1, y: f64x1) -> f64x1;
|
||||
fn aarch64_vrsqrtsq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vrsqrtsq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vrbit_s8(x: i8x8) -> i8x8;
|
||||
fn aarch64_vrbit_u8(x: u8x8) -> u8x8;
|
||||
fn aarch64_vrbitq_s8(x: i8x16) -> i8x16;
|
||||
fn aarch64_vrbitq_u8(x: u8x16) -> u8x16;
|
||||
fn aarch64_vpadd_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vpadd_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vpadd_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vpadd_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vpadd_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vpadd_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vpadd_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vpaddq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vpaddq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vpaddq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vpaddq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vpaddq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vpaddq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vpaddq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vpaddq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vpaddq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vpaddq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vpaddl_s16(x: i8x8) -> i16x4;
|
||||
fn aarch64_vpaddl_u16(x: u8x8) -> u16x4;
|
||||
fn aarch64_vpaddl_s32(x: i16x4) -> i32x2;
|
||||
fn aarch64_vpaddl_u32(x: u16x4) -> u32x2;
|
||||
fn aarch64_vpaddl_s64(x: i32x2) -> i64x1;
|
||||
fn aarch64_vpaddl_u64(x: u32x2) -> u64x1;
|
||||
fn aarch64_vpaddlq_s16(x: i8x16) -> i16x8;
|
||||
fn aarch64_vpaddlq_u16(x: u8x16) -> u16x8;
|
||||
fn aarch64_vpaddlq_s32(x: i16x8) -> i32x4;
|
||||
fn aarch64_vpaddlq_u32(x: u16x8) -> u32x4;
|
||||
fn aarch64_vpaddlq_s64(x: i32x4) -> i64x2;
|
||||
fn aarch64_vpaddlq_u64(x: u32x4) -> u64x2;
|
||||
fn aarch64_vpmax_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vpmax_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vpmax_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vpmax_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vpmax_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vpmax_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vpmax_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vpmaxq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vpmaxq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vpmaxq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vpmaxq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vpmaxq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vpmaxq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vpmaxq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vpmaxq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vpmaxq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vpmaxq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vpmin_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vpmin_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vpmin_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vpmin_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vpmin_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vpmin_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vpmin_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vpminq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vpminq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vpminq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vpminq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vpminq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vpminq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vpminq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vpminq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vpminq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vpminq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vpmaxnm_s8(x: i8x8, y: i8x8) -> i8x8;
|
||||
fn aarch64_vpmaxnm_u8(x: u8x8, y: u8x8) -> u8x8;
|
||||
fn aarch64_vpmaxnm_s16(x: i16x4, y: i16x4) -> i16x4;
|
||||
fn aarch64_vpmaxnm_u16(x: u16x4, y: u16x4) -> u16x4;
|
||||
fn aarch64_vpmaxnm_s32(x: i32x2, y: i32x2) -> i32x2;
|
||||
fn aarch64_vpmaxnm_u32(x: u32x2, y: u32x2) -> u32x2;
|
||||
fn aarch64_vpmaxnm_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vpmaxnmq_s8(x: i8x16, y: i8x16) -> i8x16;
|
||||
fn aarch64_vpmaxnmq_u8(x: u8x16, y: u8x16) -> u8x16;
|
||||
fn aarch64_vpmaxnmq_s16(x: i16x8, y: i16x8) -> i16x8;
|
||||
fn aarch64_vpmaxnmq_u16(x: u16x8, y: u16x8) -> u16x8;
|
||||
fn aarch64_vpmaxnmq_s32(x: i32x4, y: i32x4) -> i32x4;
|
||||
fn aarch64_vpmaxnmq_u32(x: u32x4, y: u32x4) -> u32x4;
|
||||
fn aarch64_vpmaxnmq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vpmaxnmq_s64(x: i64x2, y: i64x2) -> i64x2;
|
||||
fn aarch64_vpmaxnmq_u64(x: u64x2, y: u64x2) -> u64x2;
|
||||
fn aarch64_vpmaxnmq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vpminnm_f32(x: f32x2, y: f32x2) -> f32x2;
|
||||
fn aarch64_vpminnmq_f32(x: f32x4, y: f32x4) -> f32x4;
|
||||
fn aarch64_vpminnmq_f64(x: f64x2, y: f64x2) -> f64x2;
|
||||
fn aarch64_vaddv_s8(x: i8x8) -> i8;
|
||||
fn aarch64_vaddv_u8(x: u8x8) -> u8;
|
||||
fn aarch64_vaddv_s16(x: i16x4) -> i16;
|
||||
fn aarch64_vaddv_u16(x: u16x4) -> u16;
|
||||
fn aarch64_vaddv_s32(x: i32x2) -> i32;
|
||||
fn aarch64_vaddv_u32(x: u32x2) -> u32;
|
||||
fn aarch64_vaddv_f32(x: f32x2) -> f32;
|
||||
fn aarch64_vaddvq_s8(x: i8x16) -> i8;
|
||||
fn aarch64_vaddvq_u8(x: u8x16) -> u8;
|
||||
fn aarch64_vaddvq_s16(x: i16x8) -> i16;
|
||||
fn aarch64_vaddvq_u16(x: u16x8) -> u16;
|
||||
fn aarch64_vaddvq_s32(x: i32x4) -> i32;
|
||||
fn aarch64_vaddvq_u32(x: u32x4) -> u32;
|
||||
    fn aarch64_vaddvq_f32(x: f32x4) -> f32;
    fn aarch64_vaddvq_s64(x: i64x2) -> i64;
    fn aarch64_vaddvq_u64(x: u64x2) -> u64;
    fn aarch64_vaddvq_f64(x: f64x2) -> f64;
    fn aarch64_vaddlv_s8(x: i8x8) -> i16;
    fn aarch64_vaddlv_u8(x: u8x8) -> u16;
    fn aarch64_vaddlv_s16(x: i16x4) -> i32;
    fn aarch64_vaddlv_u16(x: u16x4) -> u32;
    fn aarch64_vaddlv_s32(x: i32x2) -> i64;
    fn aarch64_vaddlv_u32(x: u32x2) -> u64;
    fn aarch64_vaddlvq_s8(x: i8x16) -> i16;
    fn aarch64_vaddlvq_u8(x: u8x16) -> u16;
    fn aarch64_vaddlvq_s16(x: i16x8) -> i32;
    fn aarch64_vaddlvq_u16(x: u16x8) -> u32;
    fn aarch64_vaddlvq_s32(x: i32x4) -> i64;
    fn aarch64_vaddlvq_u32(x: u32x4) -> u64;
    fn aarch64_vmaxv_s8(x: i8x8) -> i8;
    fn aarch64_vmaxv_u8(x: u8x8) -> u8;
    fn aarch64_vmaxv_s16(x: i16x4) -> i16;
    fn aarch64_vmaxv_u16(x: u16x4) -> u16;
    fn aarch64_vmaxv_s32(x: i32x2) -> i32;
    fn aarch64_vmaxv_u32(x: u32x2) -> u32;
    fn aarch64_vmaxv_f32(x: f32x2) -> f32;
    fn aarch64_vmaxvq_s8(x: i8x16) -> i8;
    fn aarch64_vmaxvq_u8(x: u8x16) -> u8;
    fn aarch64_vmaxvq_s16(x: i16x8) -> i16;
    fn aarch64_vmaxvq_u16(x: u16x8) -> u16;
    fn aarch64_vmaxvq_s32(x: i32x4) -> i32;
    fn aarch64_vmaxvq_u32(x: u32x4) -> u32;
    fn aarch64_vmaxvq_f32(x: f32x4) -> f32;
    fn aarch64_vmaxvq_f64(x: f64x2) -> f64;
    fn aarch64_vminv_s8(x: i8x8) -> i8;
    fn aarch64_vminv_u8(x: u8x8) -> u8;
    fn aarch64_vminv_s16(x: i16x4) -> i16;
    fn aarch64_vminv_u16(x: u16x4) -> u16;
    fn aarch64_vminv_s32(x: i32x2) -> i32;
    fn aarch64_vminv_u32(x: u32x2) -> u32;
    fn aarch64_vminv_f32(x: f32x2) -> f32;
    fn aarch64_vminvq_s8(x: i8x16) -> i8;
    fn aarch64_vminvq_u8(x: u8x16) -> u8;
    fn aarch64_vminvq_s16(x: i16x8) -> i16;
    fn aarch64_vminvq_u16(x: u16x8) -> u16;
    fn aarch64_vminvq_s32(x: i32x4) -> i32;
    fn aarch64_vminvq_u32(x: u32x4) -> u32;
    fn aarch64_vminvq_f32(x: f32x4) -> f32;
    fn aarch64_vminvq_f64(x: f64x2) -> f64;
    fn aarch64_vmaxnmv_f32(x: f32x2) -> f32;
    fn aarch64_vmaxnmvq_f32(x: f32x4) -> f32;
    fn aarch64_vmaxnmvq_f64(x: f64x2) -> f64;
    fn aarch64_vminnmv_f32(x: f32x2) -> f32;
    fn aarch64_vminnmvq_f32(x: f32x4) -> f32;
    fn aarch64_vminnmvq_f64(x: f64x2) -> f64;
    fn aarch64_vqtbl1_s8(x: i8x16, y: u8x8) -> i8x8;
    fn aarch64_vqtbl1_u8(x: u8x16, y: u8x8) -> u8x8;
    fn aarch64_vqtbl1q_s8(x: i8x16, y: u8x16) -> i8x16;
    fn aarch64_vqtbl1q_u8(x: u8x16, y: u8x16) -> u8x16;
    fn aarch64_vqtbx1_s8(x: i8x8, y: i8x16, z: u8x8) -> i8x8;
    fn aarch64_vqtbx1_u8(x: u8x8, y: u8x16, z: u8x8) -> u8x8;
    fn aarch64_vqtbx1q_s8(x: i8x16, y: i8x16, z: u8x16) -> i8x16;
    fn aarch64_vqtbx1q_u8(x: u8x16, y: u8x16, z: u8x16) -> u8x16;
    fn aarch64_vqtbl2_s8(x: (i8x16, i8x16), y: u8x8) -> i8x8;
    fn aarch64_vqtbl2_u8(x: (u8x16, u8x16), y: u8x8) -> u8x8;
    fn aarch64_vqtbl2q_s8(x: (i8x16, i8x16), y: u8x16) -> i8x16;
    fn aarch64_vqtbl2q_u8(x: (u8x16, u8x16), y: u8x16) -> u8x16;
    fn aarch64_vqtbx2_s8(x: (i8x16, i8x16), y: u8x8) -> i8x8;
    fn aarch64_vqtbx2_u8(x: (u8x16, u8x16), y: u8x8) -> u8x8;
    fn aarch64_vqtbx2q_s8(x: (i8x16, i8x16), y: u8x16) -> i8x16;
    fn aarch64_vqtbx2q_u8(x: (u8x16, u8x16), y: u8x16) -> u8x16;
    fn aarch64_vqtbl3_s8(x: (i8x16, i8x16, i8x16), y: u8x8) -> i8x8;
    fn aarch64_vqtbl3_u8(x: (u8x16, u8x16, u8x16), y: u8x8) -> u8x8;
    fn aarch64_vqtbl3q_s8(x: (i8x16, i8x16, i8x16), y: u8x16) -> i8x16;
    fn aarch64_vqtbl3q_u8(x: (u8x16, u8x16, u8x16), y: u8x16) -> u8x16;
    fn aarch64_vqtbx3_s8(x: i8x8, y: (i8x16, i8x16, i8x16), z: u8x8) -> i8x8;
    fn aarch64_vqtbx3_u8(x: u8x8, y: (u8x16, u8x16, u8x16), z: u8x8) -> u8x8;
    fn aarch64_vqtbx3q_s8(x: i8x16, y: (i8x16, i8x16, i8x16), z: u8x16) -> i8x16;
    fn aarch64_vqtbx3q_u8(x: u8x16, y: (u8x16, u8x16, u8x16), z: u8x16) -> u8x16;
    fn aarch64_vqtbl4_s8(x: (i8x16, i8x16, i8x16, i8x16), y: u8x8) -> i8x8;
    fn aarch64_vqtbl4_u8(x: (u8x16, u8x16, u8x16, u8x16), y: u8x8) -> u8x8;
    fn aarch64_vqtbl4q_s8(x: (i8x16, i8x16, i8x16, i8x16), y: u8x16) -> i8x16;
    fn aarch64_vqtbl4q_u8(x: (u8x16, u8x16, u8x16, u8x16), y: u8x16) -> u8x16;
}

pub trait Aarch64F32x4 {
    fn to_f64(self) -> f64x2;
}
impl Aarch64F32x4 for f32x4 {
    #[inline]
    fn to_f64(self) -> f64x2 {
        unsafe {
            simd_cast(f32x2(self.0, self.1))
        }
    }
}

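A minimal usage sketch for the widening conversion above, assuming the crate is built as `simd` on a compatible nightly and that `Aarch64F32x4` is importable from `simd::aarch64::neon` (the module path is inferred from the layout shown here):

```rust
extern crate simd;

#[cfg(target_arch = "aarch64")]
fn sum_low_lanes(v: simd::f32x4) -> f64 {
    use simd::aarch64::neon::Aarch64F32x4;
    // `to_f64` widens lanes 0 and 1 of the f32x4 into an f64x2.
    let wide = v.to_f64();
    wide.extract(0) + wide.extract(1)
}
```
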
pub trait Aarch64U8x16 {
    fn table_lookup_1(self, t0: u8x16) -> u8x16;
}
impl Aarch64U8x16 for u8x16 {
    #[inline]
    fn table_lookup_1(self, t0: u8x16) -> u8x16 {
        unsafe {aarch64_vqtbl1q_u8(t0, self)}
    }
}
pub trait Aarch64I8x16 {
    fn table_lookup_1(self, t0: i8x16) -> i8x16;
}
impl Aarch64I8x16 for i8x16 {
    #[inline]
    fn table_lookup_1(self, t0: i8x16) -> i8x16 {
        // Duplicating the table makes the two-table lookup behave like a
        // one-table lookup; the indices are bitcast to the unsigned type.
        unsafe {aarch64_vqtbl2q_s8((t0, t0), ::bitcast(self))}
    }
}

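A sketch of a byte permutation built on `table_lookup_1` (TBL-family semantics: each index selects a byte of the table, and out-of-range indices yield zero). The import path is an assumption based on the module layout:

```rust
extern crate simd;

#[cfg(target_arch = "aarch64")]
fn reverse_bytes(table: simd::u8x16) -> simd::u8x16 {
    use simd::aarch64::neon::Aarch64U8x16;
    // Index vector 15, 14, ..., 0 picks the table's bytes in reverse.
    let idx = simd::u8x16::new(15, 14, 13, 12, 11, 10, 9, 8,
                               7, 6, 5, 4, 3, 2, 1, 0);
    idx.table_lookup_1(table)
}
```
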
#[doc(hidden)]
pub mod common {
    use super::super::super::*;
    use std::mem;

    #[inline]
    pub fn f32x4_sqrt(x: f32x4) -> f32x4 {
        unsafe {super::aarch64_vsqrtq_f32(x)}
    }
    #[inline]
    pub fn f32x4_approx_rsqrt(x: f32x4) -> f32x4 {
        unsafe {super::aarch64_vrsqrteq_f32(x)}
    }
    #[inline]
    pub fn f32x4_approx_reciprocal(x: f32x4) -> f32x4 {
        unsafe {super::aarch64_vrecpeq_f32(x)}
    }
    #[inline]
    pub fn f32x4_max(x: f32x4, y: f32x4) -> f32x4 {
        unsafe {super::aarch64_vmaxq_f32(x, y)}
    }
    #[inline]
    pub fn f32x4_min(x: f32x4, y: f32x4) -> f32x4 {
        unsafe {super::aarch64_vminq_f32(x, y)}
    }

    macro_rules! bools {
        ($($ty: ty, $all: ident ($min: ident), $any: ident ($max: ident);)*) => {
            $(
                #[inline]
                pub fn $all(x: $ty) -> bool {
                    unsafe {
                        super::$min(mem::transmute(x)) != 0
                    }
                }
                #[inline]
                pub fn $any(x: $ty) -> bool {
                    unsafe {
                        super::$max(mem::transmute(x)) != 0
                    }
                }
            )*
        }
    }

    bools! {
        bool32fx4, bool32fx4_all(aarch64_vminvq_u32), bool32fx4_any(aarch64_vmaxvq_u32);
        bool8ix16, bool8ix16_all(aarch64_vminvq_u8), bool8ix16_any(aarch64_vmaxvq_u8);
        bool16ix8, bool16ix8_all(aarch64_vminvq_u16), bool16ix8_any(aarch64_vmaxvq_u16);
        bool32ix4, bool32ix4_all(aarch64_vminvq_u32), bool32ix4_any(aarch64_vmaxvq_u32);
    }
}
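The `all`/`any` implementations above lean on a NEON property: a boolean lane is either all-ones or all-zeros, so the horizontal minimum (`vminvq`) is non-zero exactly when every lane is set, and the horizontal maximum (`vmaxvq`) is non-zero when at least one lane is. A scalar model of the same reduction, runnable on stable Rust:

```rust
// Scalar model of the vminvq/vmaxvq reductions used for `all` and `any`.
fn all4(mask: [u32; 4]) -> bool {
    mask.iter().copied().min().unwrap() != 0
}
fn any4(mask: [u32; 4]) -> bool {
    mask.iter().copied().max().unwrap() != 0
}

fn main() {
    assert!(all4([!0; 4]));
    assert!(!all4([!0, 0, !0, !0]));
    assert!(any4([0, 0, !0, 0]));
    assert!(!any4([0; 4]));
}
```
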
@@ -0,0 +1,4 @@
//! Features specific to ARM CPUs.

#[cfg(any(feature = "doc", target_feature = "neon"))]
pub mod neon;

@@ -0,0 +1,530 @@
use super::super::*;
use sixty_four::{i64x2, u64x2};

#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u32x2(u32, u32);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i32x2(i32, i32);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct f32x2(f32, f32);

#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u16x4(u16, u16, u16, u16);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i16x4(i16, i16, i16, i16);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u8x8(u8, u8, u8, u8,
                u8, u8, u8, u8);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i8x8(i8, i8, i8, i8,
                i8, i8, i8, i8);

#[repr(simd)]
#[derive(Copy, Clone)]
pub struct i64x1(i64);
#[repr(simd)]
#[derive(Copy, Clone)]
pub struct u64x1(u64);

#[allow(dead_code)]
extern "platform-intrinsic" {
    fn arm_vhadd_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vhadd_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vhadd_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vhadd_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vhadd_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vhadd_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vhaddq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vhaddq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vhaddq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vhaddq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vhaddq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vhaddq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vrhadd_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vrhadd_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vrhadd_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vrhadd_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vrhadd_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vrhadd_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vrhaddq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vrhaddq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vrhaddq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vrhaddq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vrhaddq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vrhaddq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vqadd_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vqadd_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vqadd_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vqadd_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vqadd_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vqadd_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vqadd_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vqadd_u64(x: u64x1, y: u64x1) -> u64x1;
    fn arm_vqaddq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vqaddq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vqaddq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vqaddq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vqaddq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vqaddq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vqaddq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vqaddq_u64(x: u64x2, y: u64x2) -> u64x2;
    fn arm_vraddhn_s16(x: i16x8, y: i16x8) -> i8x8;
    fn arm_vraddhn_u16(x: u16x8, y: u16x8) -> u8x8;
    fn arm_vraddhn_s32(x: i32x4, y: i32x4) -> i16x4;
    fn arm_vraddhn_u32(x: u32x4, y: u32x4) -> u16x4;
    fn arm_vraddhn_s64(x: i64x2, y: i64x2) -> i32x2;
    fn arm_vraddhn_u64(x: u64x2, y: u64x2) -> u32x2;
    fn arm_vfma_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vfmaq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vqdmulh_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vqdmulh_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vqdmulhq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vqdmulhq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vqrdmulh_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vqrdmulh_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vqrdmulhq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vqrdmulhq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vmull_s8(x: i8x8, y: i8x8) -> i16x8;
    fn arm_vmull_u8(x: u8x8, y: u8x8) -> u16x8;
    fn arm_vmull_s16(x: i16x4, y: i16x4) -> i32x4;
    fn arm_vmull_u16(x: u16x4, y: u16x4) -> u32x4;
    fn arm_vmull_s32(x: i32x2, y: i32x2) -> i64x2;
    fn arm_vmull_u32(x: u32x2, y: u32x2) -> u64x2;
    fn arm_vqdmullq_s8(x: i8x8, y: i8x8) -> i16x8;
    fn arm_vqdmullq_s16(x: i16x4, y: i16x4) -> i32x4;
    fn arm_vhsub_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vhsub_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vhsub_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vhsub_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vhsub_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vhsub_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vhsubq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vhsubq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vhsubq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vhsubq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vhsubq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vhsubq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vqsub_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vqsub_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vqsub_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vqsub_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vqsub_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vqsub_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vqsub_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vqsub_u64(x: u64x1, y: u64x1) -> u64x1;
    fn arm_vqsubq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vqsubq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vqsubq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vqsubq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vqsubq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vqsubq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vqsubq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vqsubq_u64(x: u64x2, y: u64x2) -> u64x2;
    fn arm_vrsubhn_s16(x: i16x8, y: i16x8) -> i8x8;
    fn arm_vrsubhn_u16(x: u16x8, y: u16x8) -> u8x8;
    fn arm_vrsubhn_s32(x: i32x4, y: i32x4) -> i16x4;
    fn arm_vrsubhn_u32(x: u32x4, y: u32x4) -> u16x4;
    fn arm_vrsubhn_s64(x: i64x2, y: i64x2) -> i32x2;
    fn arm_vrsubhn_u64(x: u64x2, y: u64x2) -> u32x2;
    fn arm_vabd_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vabd_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vabd_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vabd_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vabd_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vabd_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vabd_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vabdq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vabdq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vabdq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vabdq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vabdq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vabdq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vabdq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vmax_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vmax_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vmax_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vmax_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vmax_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vmax_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vmax_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vmaxq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vmaxq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vmaxq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vmaxq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vmaxq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vmaxq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vmaxq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vmin_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vmin_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vmin_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vmin_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vmin_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vmin_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vmin_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vminq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vminq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vminq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vminq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vminq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vminq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vminq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vshl_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vshl_u8(x: u8x8, y: i8x8) -> u8x8;
    fn arm_vshl_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vshl_u16(x: u16x4, y: i16x4) -> u16x4;
    fn arm_vshl_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vshl_u32(x: u32x2, y: i32x2) -> u32x2;
    fn arm_vshl_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vshl_u64(x: u64x1, y: i64x1) -> u64x1;
    fn arm_vshlq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vshlq_u8(x: u8x16, y: i8x16) -> u8x16;
    fn arm_vshlq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vshlq_u16(x: u16x8, y: i16x8) -> u16x8;
    fn arm_vshlq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vshlq_u32(x: u32x4, y: i32x4) -> u32x4;
    fn arm_vshlq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vshlq_u64(x: u64x2, y: i64x2) -> u64x2;
    fn arm_vqshl_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vqshl_u8(x: u8x8, y: i8x8) -> u8x8;
    fn arm_vqshl_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vqshl_u16(x: u16x4, y: i16x4) -> u16x4;
    fn arm_vqshl_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vqshl_u32(x: u32x2, y: i32x2) -> u32x2;
    fn arm_vqshl_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vqshl_u64(x: u64x1, y: i64x1) -> u64x1;
    fn arm_vqshlq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vqshlq_u8(x: u8x16, y: i8x16) -> u8x16;
    fn arm_vqshlq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vqshlq_u16(x: u16x8, y: i16x8) -> u16x8;
    fn arm_vqshlq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vqshlq_u32(x: u32x4, y: i32x4) -> u32x4;
    fn arm_vqshlq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vqshlq_u64(x: u64x2, y: i64x2) -> u64x2;
    fn arm_vrshl_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vrshl_u8(x: u8x8, y: i8x8) -> u8x8;
    fn arm_vrshl_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vrshl_u16(x: u16x4, y: i16x4) -> u16x4;
    fn arm_vrshl_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vrshl_u32(x: u32x2, y: i32x2) -> u32x2;
    fn arm_vrshl_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vrshl_u64(x: u64x1, y: i64x1) -> u64x1;
    fn arm_vrshlq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vrshlq_u8(x: u8x16, y: i8x16) -> u8x16;
    fn arm_vrshlq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vrshlq_u16(x: u16x8, y: i16x8) -> u16x8;
    fn arm_vrshlq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vrshlq_u32(x: u32x4, y: i32x4) -> u32x4;
    fn arm_vrshlq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vrshlq_u64(x: u64x2, y: i64x2) -> u64x2;
    fn arm_vqrshl_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vqrshl_u8(x: u8x8, y: i8x8) -> u8x8;
    fn arm_vqrshl_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vqrshl_u16(x: u16x4, y: i16x4) -> u16x4;
    fn arm_vqrshl_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vqrshl_u32(x: u32x2, y: i32x2) -> u32x2;
    fn arm_vqrshl_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vqrshl_u64(x: u64x1, y: i64x1) -> u64x1;
    fn arm_vqrshlq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vqrshlq_u8(x: u8x16, y: i8x16) -> u8x16;
    fn arm_vqrshlq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vqrshlq_u16(x: u16x8, y: i16x8) -> u16x8;
    fn arm_vqrshlq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vqrshlq_u32(x: u32x4, y: i32x4) -> u32x4;
    fn arm_vqrshlq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vqrshlq_u64(x: u64x2, y: i64x2) -> u64x2;
    fn arm_vqshrun_n_s16(x: i16x8, y: u32) -> i8x8;
    fn arm_vqshrun_n_s32(x: i32x4, y: u32) -> i16x4;
    fn arm_vqshrun_n_s64(x: i64x2, y: u32) -> i32x2;
    fn arm_vqrshrun_n_s16(x: i16x8, y: u32) -> i8x8;
    fn arm_vqrshrun_n_s32(x: i32x4, y: u32) -> i16x4;
    fn arm_vqrshrun_n_s64(x: i64x2, y: u32) -> i32x2;
    fn arm_vqshrn_n_s16(x: i16x8, y: u32) -> i8x8;
    fn arm_vqshrn_n_u16(x: u16x8, y: u32) -> u8x8;
    fn arm_vqshrn_n_s32(x: i32x4, y: u32) -> i16x4;
    fn arm_vqshrn_n_u32(x: u32x4, y: u32) -> u16x4;
    fn arm_vqshrn_n_s64(x: i64x2, y: u32) -> i32x2;
    fn arm_vqshrn_n_u64(x: u64x2, y: u32) -> u32x2;
    fn arm_vrshrn_n_s16(x: i16x8, y: u32) -> i8x8;
    fn arm_vrshrn_n_u16(x: u16x8, y: u32) -> u8x8;
    fn arm_vrshrn_n_s32(x: i32x4, y: u32) -> i16x4;
    fn arm_vrshrn_n_u32(x: u32x4, y: u32) -> u16x4;
    fn arm_vrshrn_n_s64(x: i64x2, y: u32) -> i32x2;
    fn arm_vrshrn_n_u64(x: u64x2, y: u32) -> u32x2;
    fn arm_vqrshrn_n_s16(x: i16x8, y: u32) -> i8x8;
    fn arm_vqrshrn_n_u16(x: u16x8, y: u32) -> u8x8;
    fn arm_vqrshrn_n_s32(x: i32x4, y: u32) -> i16x4;
    fn arm_vqrshrn_n_u32(x: u32x4, y: u32) -> u16x4;
    fn arm_vqrshrn_n_s64(x: i64x2, y: u32) -> i32x2;
    fn arm_vqrshrn_n_u64(x: u64x2, y: u32) -> u32x2;
    fn arm_vsri_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vsri_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vsri_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vsri_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vsri_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vsri_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vsri_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vsri_u64(x: u64x1, y: u64x1) -> u64x1;
    fn arm_vsriq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vsriq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vsriq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vsriq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vsriq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vsriq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vsriq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vsriq_u64(x: u64x2, y: u64x2) -> u64x2;
    fn arm_vsli_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vsli_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vsli_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vsli_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vsli_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vsli_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vsli_s64(x: i64x1, y: i64x1) -> i64x1;
    fn arm_vsli_u64(x: u64x1, y: u64x1) -> u64x1;
    fn arm_vsliq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vsliq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vsliq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vsliq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vsliq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vsliq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vsliq_s64(x: i64x2, y: i64x2) -> i64x2;
    fn arm_vsliq_u64(x: u64x2, y: u64x2) -> u64x2;
    fn arm_vvqmovn_s16(x: i16x8) -> i8x8;
    fn arm_vvqmovn_u16(x: u16x8) -> u8x8;
    fn arm_vvqmovn_s32(x: i32x4) -> i16x4;
    fn arm_vvqmovn_u32(x: u32x4) -> u16x4;
    fn arm_vvqmovn_s64(x: i64x2) -> i32x2;
    fn arm_vvqmovn_u64(x: u64x2) -> u32x2;
    fn arm_vabs_s8(x: i8x8) -> i8x8;
    fn arm_vabs_s16(x: i16x4) -> i16x4;
    fn arm_vabs_s32(x: i32x2) -> i32x2;
    fn arm_vabsq_s8(x: i8x16) -> i8x16;
    fn arm_vabsq_s16(x: i16x8) -> i16x8;
    fn arm_vabsq_s32(x: i32x4) -> i32x4;
    fn arm_vabs_f32(x: f32x2) -> f32x2;
    fn arm_vabsq_f32(x: f32x4) -> f32x4;
    fn arm_vqabs_s8(x: i8x8) -> i8x8;
    fn arm_vqabs_s16(x: i16x4) -> i16x4;
    fn arm_vqabs_s32(x: i32x2) -> i32x2;
    fn arm_vqabsq_s8(x: i8x16) -> i8x16;
    fn arm_vqabsq_s16(x: i16x8) -> i16x8;
    fn arm_vqabsq_s32(x: i32x4) -> i32x4;
    fn arm_vqneg_s8(x: i8x8) -> i8x8;
    fn arm_vqneg_s16(x: i16x4) -> i16x4;
    fn arm_vqneg_s32(x: i32x2) -> i32x2;
    fn arm_vqnegq_s8(x: i8x16) -> i8x16;
    fn arm_vqnegq_s16(x: i16x8) -> i16x8;
    fn arm_vqnegq_s32(x: i32x4) -> i32x4;
    fn arm_vclz_s8(x: i8x8) -> i8x8;
    fn arm_vclz_u8(x: u8x8) -> u8x8;
    fn arm_vclz_s16(x: i16x4) -> i16x4;
    fn arm_vclz_u16(x: u16x4) -> u16x4;
    fn arm_vclz_s32(x: i32x2) -> i32x2;
    fn arm_vclz_u32(x: u32x2) -> u32x2;
    fn arm_vclzq_s8(x: i8x16) -> i8x16;
    fn arm_vclzq_u8(x: u8x16) -> u8x16;
    fn arm_vclzq_s16(x: i16x8) -> i16x8;
    fn arm_vclzq_u16(x: u16x8) -> u16x8;
    fn arm_vclzq_s32(x: i32x4) -> i32x4;
    fn arm_vclzq_u32(x: u32x4) -> u32x4;
    fn arm_vcls_s8(x: i8x8) -> i8x8;
    fn arm_vcls_u8(x: u8x8) -> u8x8;
    fn arm_vcls_s16(x: i16x4) -> i16x4;
    fn arm_vcls_u16(x: u16x4) -> u16x4;
    fn arm_vcls_s32(x: i32x2) -> i32x2;
    fn arm_vcls_u32(x: u32x2) -> u32x2;
    fn arm_vclsq_s8(x: i8x16) -> i8x16;
    fn arm_vclsq_u8(x: u8x16) -> u8x16;
    fn arm_vclsq_s16(x: i16x8) -> i16x8;
    fn arm_vclsq_u16(x: u16x8) -> u16x8;
    fn arm_vclsq_s32(x: i32x4) -> i32x4;
    fn arm_vclsq_u32(x: u32x4) -> u32x4;
    fn arm_vcnt_s8(x: i8x8) -> i8x8;
    fn arm_vcnt_u8(x: u8x8) -> u8x8;
    fn arm_vcntq_s8(x: i8x16) -> i8x16;
    fn arm_vcntq_u8(x: u8x16) -> u8x16;
    fn arm_vrecpe_u32(x: u32x2) -> u32x2;
    fn arm_vrecpe_f32(x: f32x2) -> f32x2;
    fn arm_vrecpeq_u32(x: u32x4) -> u32x4;
    fn arm_vrecpeq_f32(x: f32x4) -> f32x4;
    fn arm_vrecps_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vrecpsq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vsqrt_f32(x: f32x2) -> f32x2;
    fn arm_vsqrtq_f32(x: f32x4) -> f32x4;
    fn arm_vrsqrte_u32(x: u32x2) -> u32x2;
    fn arm_vrsqrte_f32(x: f32x2) -> f32x2;
    fn arm_vrsqrteq_u32(x: u32x4) -> u32x4;
    fn arm_vrsqrteq_f32(x: f32x4) -> f32x4;
    fn arm_vrsqrts_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vrsqrtsq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vbsl_s8(x: u8x8, y: i8x8) -> i8x8;
    fn arm_vbsl_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vbsl_s16(x: u16x4, y: i16x4) -> i16x4;
    fn arm_vbsl_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vbsl_s32(x: u32x2, y: i32x2) -> i32x2;
    fn arm_vbsl_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vbsl_s64(x: u64x1, y: i64x1) -> i64x1;
    fn arm_vbsl_u64(x: u64x1, y: u64x1) -> u64x1;
    fn arm_vbslq_s8(x: u8x16, y: i8x16) -> i8x16;
    fn arm_vbslq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vbslq_s16(x: u16x8, y: i16x8) -> i16x8;
    fn arm_vbslq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vbslq_s32(x: u32x4, y: i32x4) -> i32x4;
    fn arm_vbslq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vbslq_s64(x: u64x2, y: i64x2) -> i64x2;
    fn arm_vbslq_u64(x: u64x2, y: u64x2) -> u64x2;
    fn arm_vpadd_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vpadd_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vpadd_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vpadd_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vpadd_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vpadd_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vpadd_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vpaddl_s16(x: i8x8) -> i16x4;
    fn arm_vpaddl_u16(x: u8x8) -> u16x4;
    fn arm_vpaddl_s32(x: i16x4) -> i32x2;
    fn arm_vpaddl_u32(x: u16x4) -> u32x2;
    fn arm_vpaddl_s64(x: i32x2) -> i64x1;
    fn arm_vpaddl_u64(x: u32x2) -> u64x1;
    fn arm_vpaddlq_s16(x: i8x16) -> i16x8;
    fn arm_vpaddlq_u16(x: u8x16) -> u16x8;
    fn arm_vpaddlq_s32(x: i16x8) -> i32x4;
    fn arm_vpaddlq_u32(x: u16x8) -> u32x4;
    fn arm_vpaddlq_s64(x: i32x4) -> i64x2;
    fn arm_vpaddlq_u64(x: u32x4) -> u64x2;
    fn arm_vpadal_s16(x: i16x4, y: i8x8) -> i16x4;
    fn arm_vpadal_u16(x: u16x4, y: u8x8) -> u16x4;
    fn arm_vpadal_s32(x: i32x2, y: i16x4) -> i32x2;
    fn arm_vpadal_u32(x: u32x2, y: u16x4) -> u32x2;
    fn arm_vpadal_s64(x: i64x1, y: i32x2) -> i64x1;
    fn arm_vpadal_u64(x: u64x1, y: u32x2) -> u64x1;
    fn arm_vpadalq_s16(x: i16x8, y: i8x16) -> i16x8;
    fn arm_vpadalq_u16(x: u16x8, y: u8x16) -> u16x8;
    fn arm_vpadalq_s32(x: i32x4, y: i16x8) -> i32x4;
    fn arm_vpadalq_u32(x: u32x4, y: u16x8) -> u32x4;
    fn arm_vpadalq_s64(x: i64x2, y: i32x4) -> i64x2;
    fn arm_vpadalq_u64(x: u64x2, y: u32x4) -> u64x2;
    fn arm_vpmax_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vpmax_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vpmax_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vpmax_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vpmax_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vpmax_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vpmax_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vpmin_s8(x: i8x8, y: i8x8) -> i8x8;
    fn arm_vpmin_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vpmin_s16(x: i16x4, y: i16x4) -> i16x4;
    fn arm_vpmin_u16(x: u16x4, y: u16x4) -> u16x4;
    fn arm_vpmin_s32(x: i32x2, y: i32x2) -> i32x2;
    fn arm_vpmin_u32(x: u32x2, y: u32x2) -> u32x2;
    fn arm_vpmin_f32(x: f32x2, y: f32x2) -> f32x2;
    fn arm_vpminq_s8(x: i8x16, y: i8x16) -> i8x16;
    fn arm_vpminq_u8(x: u8x16, y: u8x16) -> u8x16;
    fn arm_vpminq_s16(x: i16x8, y: i16x8) -> i16x8;
    fn arm_vpminq_u16(x: u16x8, y: u16x8) -> u16x8;
    fn arm_vpminq_s32(x: i32x4, y: i32x4) -> i32x4;
    fn arm_vpminq_u32(x: u32x4, y: u32x4) -> u32x4;
    fn arm_vpminq_f32(x: f32x4, y: f32x4) -> f32x4;
    fn arm_vtbl1_s8(x: i8x8, y: u8x8) -> i8x8;
    fn arm_vtbl1_u8(x: u8x8, y: u8x8) -> u8x8;
    fn arm_vtbx1_s8(x: i8x8, y: i8x8, z: u8x8) -> i8x8;
    fn arm_vtbx1_u8(x: u8x8, y: u8x8, z: u8x8) -> u8x8;
    fn arm_vtbl2_s8(x: (i8x8, i8x8), y: u8x8) -> i8x8;
    fn arm_vtbl2_u8(x: (u8x8, u8x8), y: u8x8) -> u8x8;
    fn arm_vtbx2_s8(x: (i8x8, i8x8), y: u8x8) -> i8x8;
    fn arm_vtbx2_u8(x: (u8x8, u8x8), y: u8x8) -> u8x8;
    fn arm_vtbl3_s8(x: (i8x8, i8x8, i8x8), y: u8x8) -> i8x8;
    fn arm_vtbl3_u8(x: (u8x8, u8x8, u8x8), y: u8x8) -> u8x8;
    fn arm_vtbx3_s8(x: i8x8, y: (i8x8, i8x8, i8x8), z: u8x8) -> i8x8;
    fn arm_vtbx3_u8(x: u8x8, y: (u8x8, u8x8, u8x8), z: u8x8) -> u8x8;
    fn arm_vtbl4_s8(x: (i8x8, i8x8, i8x8, i8x8), y: u8x8) -> i8x8;
    fn arm_vtbl4_u8(x: (u8x8, u8x8, u8x8, u8x8), y: u8x8) -> u8x8;
    fn arm_vtbx4_s8(x: i8x8, y: (i8x8, i8x8, i8x8, i8x8), z: u8x8) -> i8x8;
    fn arm_vtbx4_u8(x: u8x8, y: (u8x8, u8x8, u8x8, u8x8), z: u8x8) -> u8x8;
}

impl u8x8 {
    /// Look up each lane of `self` in the 8-byte table `t0`.
    #[inline]
    pub fn table_lookup_1(self, t0: u8x8) -> u8x8 {
        unsafe {arm_vtbl1_u8(t0, self)}
    }
    /// Look up each lane of `self` in the 16-byte table `(t0, t1)`.
    #[inline]
    pub fn table_lookup_2(self, t0: u8x8, t1: u8x8) -> u8x8 {
        unsafe {arm_vtbl2_u8((t0, t1), self)}
    }
    /// Look up each lane of `self` in the 24-byte table `(t0, t1, t2)`.
    #[inline]
    pub fn table_lookup_3(self, t0: u8x8, t1: u8x8, t2: u8x8) -> u8x8 {
        unsafe {arm_vtbl3_u8((t0, t1, t2), self)}
    }
    /// Look up each lane of `self` in the 32-byte table `(t0, t1, t2, t3)`.
    #[inline]
    pub fn table_lookup_4(self, t0: u8x8, t1: u8x8, t2: u8x8, t3: u8x8) -> u8x8 {
        unsafe {arm_vtbl4_u8((t0, t1, t2, t3), self)}
    }
}

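The 32-bit NEON `VTBL` family looks up eight destination bytes in a table of one to four 64-bit registers, so `table_lookup_2` indexes into 16 table bytes. The vendored module exposes no constructors for these 64-bit vectors, so this sketch transmutes from byte arrays of the same size and assumes the crate compiles for a NEON-enabled `arm` target:

```rust
extern crate simd;

#[cfg(target_arch = "arm")]
fn lookup16(table: [u8; 16], indices: [u8; 8]) -> [u8; 8] {
    use simd::arm::neon::u8x8;
    use std::mem::transmute;
    unsafe {
        // Both [u8; 8] and u8x8 are 8 bytes, so a by-value transmute is legal.
        let lo: u8x8 = transmute([table[0], table[1], table[2], table[3],
                                  table[4], table[5], table[6], table[7]]);
        let hi: u8x8 = transmute([table[8], table[9], table[10], table[11],
                                  table[12], table[13], table[14], table[15]]);
        let idx: u8x8 = transmute(indices);
        // Each index in 0..16 selects a byte of (lo, hi); larger indices give 0.
        transmute(idx.table_lookup_2(lo, hi))
    }
}
```
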
#[doc(hidden)]
pub mod common {
    use super::super::super::*;
    use super::*;
    use std::mem;

    #[inline]
    pub fn f32x4_sqrt(x: f32x4) -> f32x4 {
        unsafe {super::arm_vsqrtq_f32(x)}
    }
    #[inline]
    pub fn f32x4_approx_rsqrt(x: f32x4) -> f32x4 {
        unsafe {super::arm_vrsqrteq_f32(x)}
    }
    #[inline]
    pub fn f32x4_approx_reciprocal(x: f32x4) -> f32x4 {
        unsafe {super::arm_vrecpeq_f32(x)}
    }
    #[inline]
    pub fn f32x4_max(x: f32x4, y: f32x4) -> f32x4 {
        unsafe {super::arm_vmaxq_f32(x, y)}
    }
    #[inline]
    pub fn f32x4_min(x: f32x4, y: f32x4) -> f32x4 {
        unsafe {super::arm_vminq_f32(x, y)}
    }

    macro_rules! bools {
        ($($ty: ty, $half: ty, $all: ident ($min: ident), $any: ident ($max: ident);)*) => {
            $(
                #[inline]
                pub fn $all(x: $ty) -> bool {
                    unsafe {
                        let (lo, hi): ($half, $half) = mem::transmute(x);
                        let x = super::$min(lo, hi);
                        // Only lane 0 of the final pairwise step is read, so the
                        // contents of the second operand never matter.
                        let y = super::$min(x, mem::uninitialized());
                        y.0 != 0
                    }
                }
                #[inline]
                pub fn $any(x: $ty) -> bool {
                    unsafe {
                        let (lo, hi): ($half, $half) = mem::transmute(x);
                        let x = super::$max(lo, hi);
                        let y = super::$max(x, mem::uninitialized());
                        y.0 != 0
                    }
                }
            )*
        }
    }

    bools! {
        bool32fx4, arm::neon::u32x2, bool32fx4_all(arm_vpmin_u32), bool32fx4_any(arm_vpmax_u32);
        bool8ix16, arm::neon::u8x8, bool8ix16_all(arm_vpmin_u8), bool8ix16_any(arm_vpmax_u8);
        bool16ix8, arm::neon::u16x4, bool16ix8_all(arm_vpmin_u16), bool16ix8_any(arm_vpmax_u16);
        bool32ix4, arm::neon::u32x2, bool32ix4_all(arm_vpmin_u32), bool32ix4_any(arm_vpmax_u32);
    }
}
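Without AArch64's horizontal reductions, `all`/`any` here fall back on the pairwise `VPMIN`/`VPMAX` instructions: one pairwise step folds the vector's two halves, a second folds the survivors, and only lane 0 of the result is read. A scalar model of the min-based `all` reduction:

```rust
// Scalar model of arm_vpmin_u32: lane-pairwise minimum of two 2-lane vectors.
fn vpmin(a: [u32; 2], b: [u32; 2]) -> [u32; 2] {
    [a[0].min(a[1]), b[0].min(b[1])]
}

// `all` over a 4-lane mask, mirroring the two-step reduction above.
fn all4(mask: [u32; 4]) -> bool {
    let (lo, hi) = ([mask[0], mask[1]], [mask[2], mask[3]]);
    let step = vpmin(lo, hi);       // [min(m0, m1), min(m2, m3)]
    let red = vpmin(step, [0, 0]);  // lane 0 = min of all four lanes
    red[0] != 0
}

fn main() {
    assert!(all4([!0; 4]));
    assert!(!all4([!0, !0, 0, !0]));
}
```
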
@@ -0,0 +1,521 @@
use super::*;
#[allow(unused_imports)]
use super::{
    simd_eq, simd_ne, simd_lt, simd_le, simd_gt, simd_ge,
    simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
    simd_insert, simd_extract,
    simd_cast,
    simd_add, simd_sub, simd_mul, simd_div, simd_shl, simd_shr, simd_and, simd_or, simd_xor,

    Unalign, bitcast,
};
use std::mem;
use std::ops;

#[cfg(any(target_arch = "x86",
          target_arch = "x86_64"))]
use x86::sse2::common;
#[cfg(any(target_arch = "arm"))]
use arm::neon::common;
#[cfg(any(target_arch = "aarch64"))]
use aarch64::neon::common;

macro_rules! basic_impls {
    ($(
        $name: ident:
        $elem: ident, $bool: ident, $shuffle: ident, $length: expr, $($first: ident),* | $($last: ident),*;
    )*) => {
        $(impl $name {
            /// Create a new instance.
            #[inline]
            pub const fn new($($first: $elem),*, $($last: $elem),*) -> $name {
                $name($($first),*, $($last),*)
            }

            /// Create a new instance where every lane has value `x`.
            #[inline]
            pub const fn splat(x: $elem) -> $name {
                $name($({ #[allow(dead_code)] struct $first; x }),*,
                      $({ #[allow(dead_code)] struct $last; x }),*)
            }

            /// Compare for equality.
            #[inline]
            pub fn eq(self, other: Self) -> $bool {
                unsafe {simd_eq(self, other)}
            }
            /// Compare for inequality.
            #[inline]
            pub fn ne(self, other: Self) -> $bool {
                unsafe {simd_ne(self, other)}
            }
            /// Compare for less-than.
            #[inline]
            pub fn lt(self, other: Self) -> $bool {
                unsafe {simd_lt(self, other)}
            }
            /// Compare for less-than-or-equal.
            #[inline]
            pub fn le(self, other: Self) -> $bool {
                unsafe {simd_le(self, other)}
            }
            /// Compare for greater-than.
            #[inline]
            pub fn gt(self, other: Self) -> $bool {
                unsafe {simd_gt(self, other)}
            }
            /// Compare for greater-than-or-equal.
            #[inline]
            pub fn ge(self, other: Self) -> $bool {
                unsafe {simd_ge(self, other)}
            }

            /// Extract the value of the `idx`th lane of `self`.
            ///
            /// # Panics
            ///
            /// `extract` will panic if `idx` is out of bounds.
            #[inline]
            pub fn extract(self, idx: u32) -> $elem {
                assert!(idx < $length);
                unsafe {simd_extract(self, idx)}
            }
            /// Return a new vector where the `idx`th lane is replaced
            /// by `elem`.
            ///
            /// # Panics
            ///
            /// `replace` will panic if `idx` is out of bounds.
            #[inline]
            pub fn replace(self, idx: u32, elem: $elem) -> Self {
                assert!(idx < $length);
                unsafe {simd_insert(self, idx, elem)}
            }

            /// Load a new value from the `idx`th position of `array`.
            ///
            /// This is equivalent to the following, but is possibly
            /// more efficient:
            ///
            /// ```rust,ignore
            /// Self::new(array[idx], array[idx + 1], ...)
            /// ```
            ///
            /// # Panics
            ///
            /// `load` will panic if `idx` is out of bounds in
            /// `array`, or if `array[idx..]` is too short.
            #[inline]
            pub fn load(array: &[$elem], idx: usize) -> Self {
                let data = &array[idx..idx + $length];
                let loaded = unsafe {
                    *(data.as_ptr() as *const Unalign<Self>)
                };
                loaded.0
            }

            /// Store the elements of `self` to `array`, starting at
            /// the `idx`th position.
            ///
            /// This is equivalent to the following, but is possibly
            /// more efficient:
            ///
            /// ```rust,ignore
            /// array[i] = self.extract(0);
            /// array[i + 1] = self.extract(1);
            /// // ...
            /// ```
            ///
            /// # Panics
            ///
            /// `store` will panic if `idx` is out of bounds in
            /// `array`, or if `array[idx..]` is too short.
            #[inline]
            pub fn store(self, array: &mut [$elem], idx: usize) {
                let place = &mut array[idx..idx + $length];
                unsafe {
                    *(place.as_mut_ptr() as *mut Unalign<Self>) = Unalign(self)
                }
            }
        })*
    }
}

basic_impls! {
    u32x4: u32, bool32ix4, simd_shuffle4, 4, x0, x1 | x2, x3;
    i32x4: i32, bool32ix4, simd_shuffle4, 4, x0, x1 | x2, x3;
    f32x4: f32, bool32fx4, simd_shuffle4, 4, x0, x1 | x2, x3;

    u16x8: u16, bool16ix8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;
    i16x8: i16, bool16ix8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;

    u8x16: u8, bool8ix16, simd_shuffle16, 16, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15;
    i8x16: i8, bool8ix16, simd_shuffle16, 16, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15;
}

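A usage sketch of the interface generated above (constructors, lane access, and the unaligned `load`/`store` through slices), assuming the crate is available as `simd` on a suitable nightly:

```rust
extern crate simd;
use simd::f32x4;

fn main() {
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let b = f32x4::splat(2.0);
    assert_eq!((a * b).extract(3), 8.0);   // lane-wise multiply

    // `load`/`store` perform unaligned accesses through a slice window.
    let xs = [0.5_f32, 1.5, 2.5, 3.5, 4.5];
    let v = f32x4::load(&xs, 1);           // reads xs[1..5]
    let mut out = [0.0_f32; 4];
    v.store(&mut out, 0);
    assert_eq!(out, [1.5, 2.5, 3.5, 4.5]);
}
```
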
macro_rules! bool_impls {
    ($(
        $name: ident:
        $elem: ident, $repr: ident, $repr_elem: ident, $length: expr, $all: ident, $any: ident,
        $($first: ident),* | $($last: ident),*
        [$(#[$cvt_meta: meta] $cvt: ident -> $cvt_to: ident),*];
    )*) => {
        $(impl $name {
            /// Convert to integer representation.
            #[inline]
            pub fn to_repr(self) -> $repr {
                unsafe {mem::transmute(self)}
            }
            /// Convert from integer representation.
            #[inline]
            pub fn from_repr(x: $repr) -> Self {
                unsafe {mem::transmute(x)}
            }

            /// Create a new instance.
            #[inline]
            pub fn new($($first: bool),*, $($last: bool),*) -> $name {
                unsafe {
                    // negate everything together
                    simd_sub($name::splat(false),
                             $name($( ($first as $repr_elem) ),*,
                                   $( ($last as $repr_elem) ),*))
                }
            }

            /// Create a new instance where every lane has value `x`.
            #[allow(unused_variables)]
            #[inline]
            pub fn splat(x: bool) -> $name {
                let x = if x {!(0 as $repr_elem)} else {0};
                $name($({ let $first = (); x}),*,
                      $({ let $last = (); x}),*)
            }

            /// Extract the value of the `idx`th lane of `self`.
            ///
            /// # Panics
            ///
            /// `extract` will panic if `idx` is out of bounds.
            #[inline]
            pub fn extract(self, idx: u32) -> bool {
                assert!(idx < $length);
                unsafe {simd_extract(self.to_repr(), idx) != 0}
            }
            /// Return a new vector where the `idx`th lane is replaced
            /// by `elem`.
            ///
            /// # Panics
            ///
            /// `replace` will panic if `idx` is out of bounds.
            #[inline]
            pub fn replace(self, idx: u32, elem: bool) -> Self {
                assert!(idx < $length);
                let x = if elem {!(0 as $repr_elem)} else {0};
                unsafe {Self::from_repr(simd_insert(self.to_repr(), idx, x))}
            }
            /// Select between elements of `then` and `else_`, based on
            /// the corresponding element of `self`.
            ///
            /// This is equivalent to the following, but is possibly
            /// more efficient:
            ///
            /// ```rust,ignore
            /// T::new(if self.extract(0) { then.extract(0) } else { else_.extract(0) },
            ///        if self.extract(1) { then.extract(1) } else { else_.extract(1) },
            ///        ...)
            /// ```
            #[inline]
            pub fn select<T: Simd<Bool = $name>>(self, then: T, else_: T) -> T {
                let then: $repr = bitcast(then);
                let else_: $repr = bitcast(else_);
                bitcast((then & self.to_repr()) | (else_ & (!self).to_repr()))
            }

            /// Check if every element of `self` is true.
            ///
            /// This is equivalent to the following, but is possibly
            /// more efficient:
            ///
            /// ```rust,ignore
            /// self.extract(0) && self.extract(1) && ...
            /// ```
            #[inline]
            pub fn all(self) -> bool {
                common::$all(self)
            }
            /// Check if any element of `self` is true.
            ///
            /// This is equivalent to the following, but is possibly
            /// more efficient:
            ///
            /// ```rust,ignore
            /// self.extract(0) || self.extract(1) || ...
            /// ```
            #[inline]
            pub fn any(self) -> bool {
                common::$any(self)
            }

            $(
                #[$cvt_meta]
                #[inline]
                pub fn $cvt(self) -> $cvt_to {
                    bitcast(self)
                }
            )*
        }
        impl ops::Not for $name {
            type Output = Self;

            #[inline]
            fn not(self) -> Self {
                Self::from_repr($repr::splat(!(0 as $repr_elem)) ^ self.to_repr())
            }
        }
        )*
    }
}

bool_impls! {
    bool32ix4: bool32i, i32x4, i32, 4, bool32ix4_all, bool32ix4_any, x0, x1 | x2, x3
    [/// Convert `self` to a boolean vector for interacting with floating point vectors.
     to_f -> bool32fx4];
    bool32fx4: bool32f, i32x4, i32, 4, bool32fx4_all, bool32fx4_any, x0, x1 | x2, x3
    [/// Convert `self` to a boolean vector for interacting with integer vectors.
     to_i -> bool32ix4];

    bool16ix8: bool16i, i16x8, i16, 8, bool16ix8_all, bool16ix8_any, x0, x1, x2, x3 | x4, x5, x6, x7 [];

    bool8ix16: bool8i, i8x16, i8, 16, bool8ix16_all, bool8ix16_any, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15 [];
}

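A sketch of the boolean-vector API generated above: comparisons produce a mask, `any`/`all` reduce it, and `select` blends two vectors through the `(then & mask) | (else & !mask)` identity shown in its body:

```rust
extern crate simd;
use simd::f32x4;

// Clamp every lane of `v` to at most `limit`, branching only once.
fn clamp_to(v: f32x4, limit: f32) -> f32x4 {
    let too_big = v.gt(f32x4::splat(limit));   // bool32fx4 mask
    if too_big.any() {
        too_big.select(f32x4::splat(limit), v)
    } else {
        v
    }
}

fn main() {
    let r = clamp_to(f32x4::new(0.5, 9.0, 2.0, 7.5), 3.0);
    assert_eq!((r.extract(1), r.extract(3)), (3.0, 3.0));
    assert_eq!(r.extract(0), 0.5);
}
```
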
impl u32x4 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i32(self) -> i32x4 {
        unsafe {simd_cast(self)}
    }
    /// Convert each lane to a 32-bit float.
    #[inline]
    pub fn to_f32(self) -> f32x4 {
        unsafe {simd_cast(self)}
    }
}
impl i32x4 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u32(self) -> u32x4 {
        unsafe {simd_cast(self)}
    }
    /// Convert each lane to a 32-bit float.
    #[inline]
    pub fn to_f32(self) -> f32x4 {
        unsafe {simd_cast(self)}
    }
}
impl f32x4 {
    /// Compute the square root of each lane.
    #[inline]
    pub fn sqrt(self) -> Self {
        common::f32x4_sqrt(self)
    }
    /// Compute an approximation to the reciprocal of the square root
    /// of `self`, that is, `f32::splat(1.0) / self.sqrt()`.
    ///
    /// The accuracy of this approximation is platform dependent.
    #[inline]
    pub fn approx_rsqrt(self) -> Self {
        common::f32x4_approx_rsqrt(self)
    }
    /// Compute an approximation to the reciprocal of `self`, that is,
    /// `f32::splat(1.0) / self`.
    ///
    /// The accuracy of this approximation is platform dependent.
    #[inline]
    pub fn approx_reciprocal(self) -> Self {
        common::f32x4_approx_reciprocal(self)
    }
    /// Compute the lane-wise maximum of `self` and `other`.
    ///
    /// This is equivalent to the following, but is possibly more
    /// efficient:
    ///
    /// ```rust,ignore
    /// f32x4::new(self.extract(0).max(other.extract(0)),
    ///            self.extract(1).max(other.extract(1)),
    ///            ...)
    /// ```
    #[inline]
    pub fn max(self, other: Self) -> Self {
        common::f32x4_max(self, other)
    }
    /// Compute the lane-wise minimum of `self` and `other`.
    ///
    /// This is equivalent to the following, but is possibly more
    /// efficient:
    ///
    /// ```rust,ignore
    /// f32x4::new(self.extract(0).min(other.extract(0)),
    ///            self.extract(1).min(other.extract(1)),
    ///            ...)
    /// ```
    #[inline]
    pub fn min(self, other: Self) -> Self {
        common::f32x4_min(self, other)
    }
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i32(self) -> i32x4 {
        unsafe {simd_cast(self)}
    }
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u32(self) -> u32x4 {
        unsafe {simd_cast(self)}
    }
}

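`approx_rsqrt` trades accuracy for speed; a standard way to recover precision is one Newton-Raphson step, r' = r * (1.5 - 0.5 * x * r * r). The refinement below is not part of the crate shown here, just a common companion to the hardware estimate:

```rust
extern crate simd;
use simd::f32x4;

// One Newton-Raphson refinement on top of the hardware estimate:
// r' = r * (1.5 - 0.5 * x * r * r)
fn rsqrt_refined(x: f32x4) -> f32x4 {
    let r = x.approx_rsqrt();
    r * (f32x4::splat(1.5) - f32x4::splat(0.5) * x * r * r)
}

fn main() {
    let y = rsqrt_refined(f32x4::splat(4.0));
    assert!((y.extract(0) - 0.5).abs() < 1e-4);
}
```
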
impl i16x8 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u16(self) -> u16x8 {
        unsafe {simd_cast(self)}
    }
}
impl u16x8 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i16(self) -> i16x8 {
        unsafe {simd_cast(self)}
    }
}

impl i8x16 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u8(self) -> u8x16 {
        unsafe {simd_cast(self)}
    }
}
impl u8x16 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i8(self) -> i8x16 {
        unsafe {simd_cast(self)}
    }
}

macro_rules! neg_impls {
    ($zero: expr, $($ty: ident,)*) => {
        $(impl ops::Neg for $ty {
            type Output = Self;
            fn neg(self) -> Self {
                $ty::splat($zero) - self
            }
        })*
    }
}
neg_impls!{
    0,
    i32x4,
    i16x8,
    i8x16,
}
neg_impls! {
    0.0,
    f32x4,
}
macro_rules! not_impls {
    ($($ty: ident,)*) => {
        $(impl ops::Not for $ty {
            type Output = Self;
            fn not(self) -> Self {
                $ty::splat(!0) ^ self
            }
        })*
    }
}
not_impls! {
    i32x4,
    i16x8,
    i8x16,
    u32x4,
    u16x8,
    u8x16,
}

macro_rules! operators {
    ($($trayt: ident ($func: ident, $method: ident): $($ty: ty),*;)*) => {
        $(
            $(impl ops::$trayt for $ty {
                type Output = Self;
                #[inline]
                fn $method(self, x: Self) -> Self {
                    unsafe {$func(self, x)}
                }
            })*
        )*
    }
}
operators! {
    Add (simd_add, add):
        i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
        f32x4;
    Sub (simd_sub, sub):
        i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
        f32x4;
    Mul (simd_mul, mul):
        i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
        f32x4;
    Div (simd_div, div): f32x4;

    BitAnd (simd_and, bitand):
        i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
        bool8ix16, bool16ix8, bool32ix4,
        bool32fx4;
    BitOr (simd_or, bitor):
        i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
        bool8ix16, bool16ix8, bool32ix4,
        bool32fx4;
    BitXor (simd_xor, bitxor):
        i8x16, u8x16, i16x8, u16x8, i32x4, u32x4,
        bool8ix16, bool16ix8, bool32ix4,
        bool32fx4;
}

macro_rules! shift_one {
    ($ty: ident, $($by: ident),*) => {
        $(
            impl ops::Shl<$by> for $ty {
                type Output = Self;
                #[inline]
                fn shl(self, other: $by) -> Self {
                    unsafe { simd_shl(self, $ty::splat(other as <$ty as Simd>::Elem)) }
                }
            }
            impl ops::Shr<$by> for $ty {
                type Output = Self;
                #[inline]
                fn shr(self, other: $by) -> Self {
                    unsafe {simd_shr(self, $ty::splat(other as <$ty as Simd>::Elem))}
                }
            }
        )*
    }
}

macro_rules! shift {
    ($($ty: ident),*) => {
        $(shift_one! {
            $ty,
            u8, u16, u32, u64, usize,
            i8, i16, i32, i64, isize
        })*
    }
}
shift! {
    i8x16, u8x16, i16x8, u16x8, i32x4, u32x4
}
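The `shift!` expansion gives every integer vector `Shl`/`Shr` by any primitive integer scalar, splatting the count across all lanes. A sketch:

```rust
extern crate simd;
use simd::u32x4;

fn main() {
    let v = u32x4::splat(0b1010);
    let left = v << 4_u32;   // every lane shifted by the same count
    let right = v >> 1_u8;   // any primitive integer type works as the count
    assert_eq!(left.extract(0), 0b1010_0000);
    assert_eq!(right.extract(0), 0b101);
}
```
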
@@ -0,0 +1,225 @@
//! `simd` offers a basic interface to the SIMD functionality of CPUs.

#![feature(cfg_target_feature, repr_simd, platform_intrinsics, const_fn)]
#![allow(non_camel_case_types)]

#[cfg(feature = "with-serde")]
extern crate serde;
#[cfg(feature = "with-serde")]
#[macro_use]
extern crate serde_derive;

/// Boolean type for 8-bit integers.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct bool8i(i8);
/// Boolean type for 16-bit integers.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct bool16i(i16);
/// Boolean type for 32-bit integers.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct bool32i(i32);
/// Boolean type for 32-bit floats.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct bool32f(i32);

macro_rules! bool {
    ($($name: ident, $inner: ty;)*) => {
        $(
            impl From<bool> for $name {
                #[inline]
                fn from(b: bool) -> $name {
                    $name(-(b as $inner))
                }
            }
            impl From<$name> for bool {
                #[inline]
                fn from(b: $name) -> bool {
                    b.0 != 0
                }
            }
        )*
    }
}
bool! {
    bool8i, i8;
    bool16i, i16;
    bool32i, i32;
    bool32f, i32;
}

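The scalar boolean wrappers use the same all-ones convention as the vector lanes: `From<bool>` stores `-(b as iN)`, i.e. `!0` for true, and the reverse conversion just tests for non-zero. A sketch:

```rust
extern crate simd;
use simd::bool32i;

fn main() {
    let t = bool32i::from(true);   // internally -1, i.e. all bits set
    let f = bool32i::from(false);  // internally 0
    assert!(bool::from(t));
    assert!(!bool::from(f));
}
```
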
/// Types that are SIMD vectors.
pub unsafe trait Simd {
    /// The corresponding boolean vector type.
    type Bool: Simd;
    /// The element that this vector stores.
    type Elem;
}

/// A SIMD vector of 4 `u32`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u32x4(u32, u32, u32, u32);
/// A SIMD vector of 4 `i32`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i32x4(i32, i32, i32, i32);
/// A SIMD vector of 4 `f32`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct f32x4(f32, f32, f32, f32);
/// A SIMD boolean vector for length-4 vectors of 32-bit integers.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool32ix4(i32, i32, i32, i32);
/// A SIMD boolean vector for length-4 vectors of 32-bit floats.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool32fx4(i32, i32, i32, i32);

#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct u32x2(u32, u32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct i32x2(i32, i32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct f32x2(f32, f32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct bool32ix2(i32, i32);
#[allow(dead_code)]
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
struct bool32fx2(i32, i32);

/// A SIMD vector of 8 `u16`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u16x8(u16, u16, u16, u16,
                 u16, u16, u16, u16);
/// A SIMD vector of 8 `i16`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i16x8(i16, i16, i16, i16,
                 i16, i16, i16, i16);
/// A SIMD boolean vector for length-8 vectors of 16-bit integers.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool16ix8(i16, i16, i16, i16,
                     i16, i16, i16, i16);

/// A SIMD vector of 16 `u8`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u8x16(u8, u8, u8, u8, u8, u8, u8, u8,
                 u8, u8, u8, u8, u8, u8, u8, u8);
/// A SIMD vector of 16 `i8`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i8x16(i8, i8, i8, i8, i8, i8, i8, i8,
                 i8, i8, i8, i8, i8, i8, i8, i8);
/// A SIMD boolean vector for length-16 vectors of 8-bit integers.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool8ix16(i8, i8, i8, i8, i8, i8, i8, i8,
                     i8, i8, i8, i8, i8, i8, i8, i8);


macro_rules! simd {
    ($($bool: ty: $($ty: ty = $elem: ty),*;)*) => {
        $($(unsafe impl Simd for $ty {
            type Bool = $bool;
            type Elem = $elem;
        }
        impl Clone for $ty { #[inline] fn clone(&self) -> Self { *self } }
        )*)*}
}
simd! {
    bool8ix16: i8x16 = i8, u8x16 = u8, bool8ix16 = bool8i;
    bool16ix8: i16x8 = i16, u16x8 = u16, bool16ix8 = bool16i;
    bool32ix4: i32x4 = i32, u32x4 = u32, bool32ix4 = bool32i;
    bool32fx4: f32x4 = f32, bool32fx4 = bool32f;

    bool32ix2: i32x2 = i32, u32x2 = u32, bool32ix2 = bool32i;
    bool32fx2: f32x2 = f32, bool32fx2 = bool32f;
}

#[allow(dead_code)]
#[inline]
fn bitcast<T: Simd, U: Simd>(x: T) -> U {
    assert_eq!(std::mem::size_of::<T>(),
               std::mem::size_of::<U>());
    unsafe {std::mem::transmute_copy(&x)}
}

#[allow(dead_code)]
extern "platform-intrinsic" {
    fn simd_eq<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
    fn simd_ne<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
    fn simd_lt<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
    fn simd_le<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
    fn simd_gt<T: Simd<Bool = U>, U>(x: T, y: T) -> U;
    fn simd_ge<T: Simd<Bool = U>, U>(x: T, y: T) -> U;

    fn simd_shuffle2<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 2]) -> U;
    fn simd_shuffle4<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 4]) -> U;
    fn simd_shuffle8<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 8]) -> U;
    fn simd_shuffle16<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 16]) -> U;

    fn simd_insert<T: Simd<Elem = U>, U>(x: T, idx: u32, val: U) -> T;
    fn simd_extract<T: Simd<Elem = U>, U>(x: T, idx: u32) -> U;

    fn simd_cast<T: Simd, U: Simd>(x: T) -> U;

    fn simd_add<T: Simd>(x: T, y: T) -> T;
    fn simd_sub<T: Simd>(x: T, y: T) -> T;
    fn simd_mul<T: Simd>(x: T, y: T) -> T;
    fn simd_div<T: Simd>(x: T, y: T) -> T;
    fn simd_shl<T: Simd>(x: T, y: T) -> T;
    fn simd_shr<T: Simd>(x: T, y: T) -> T;
    fn simd_and<T: Simd>(x: T, y: T) -> T;
    fn simd_or<T: Simd>(x: T, y: T) -> T;
    fn simd_xor<T: Simd>(x: T, y: T) -> T;
}
#[repr(packed)]
#[derive(Debug, Copy, Clone)]
struct Unalign<T>(T);

#[macro_use]
mod common;
mod sixty_four;
mod v256;

#[cfg(any(feature = "doc",
          target_arch = "x86",
          target_arch = "x86_64"))]
pub mod x86;
#[cfg(any(feature = "doc", target_arch = "arm"))]
pub mod arm;
#[cfg(any(feature = "doc", target_arch = "aarch64"))]
pub mod aarch64;
@@ -0,0 +1,229 @@
#![allow(dead_code)]
use super::*;
#[allow(unused_imports)]
use super::{
    f32x2,
    simd_eq, simd_ne, simd_lt, simd_le, simd_gt, simd_ge,
    simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
    simd_insert, simd_extract,
    simd_cast,
    simd_add, simd_sub, simd_mul, simd_div, simd_shl, simd_shr, simd_and, simd_or, simd_xor,

    Unalign, bitcast,
};
use std::mem;
use std::ops;

/// Boolean type for 64-bit integers.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone)]
pub struct bool64i(i64);
/// Boolean type for 64-bit floats.
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone)]
pub struct bool64f(i64);
/// A SIMD vector of 2 `u64`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u64x2(u64, u64);
/// A SIMD vector of 2 `i64`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i64x2(i64, i64);
/// A SIMD vector of 2 `f64`s.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct f64x2(f64, f64);
/// A SIMD boolean vector for length-2 vectors of 64-bit integers.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool64ix2(i64, i64);
/// A SIMD boolean vector for length-2 vectors of 64-bit floats.
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool64fx2(i64, i64);

simd! {
    bool64ix2: i64x2 = i64, u64x2 = u64, bool64ix2 = bool64i;
    bool64fx2: f64x2 = f64, bool64fx2 = bool64f;
}
basic_impls! {
    u64x2: u64, bool64ix2, simd_shuffle2, 2, x0 | x1;
    i64x2: i64, bool64ix2, simd_shuffle2, 2, x0 | x1;
    f64x2: f64, bool64fx2, simd_shuffle2, 2, x0 | x1;
}

mod common {
    use super::*;
    // naive for now
    #[inline]
    pub fn bool64ix2_all(x: bool64ix2) -> bool {
        x.0 != 0 && x.1 != 0
    }
    #[inline]
    pub fn bool64ix2_any(x: bool64ix2) -> bool {
        x.0 != 0 || x.1 != 0
    }
    #[inline]
    pub fn bool64fx2_all(x: bool64fx2) -> bool {
        x.0 != 0 && x.1 != 0
    }
    #[inline]
    pub fn bool64fx2_any(x: bool64fx2) -> bool {
        x.0 != 0 || x.1 != 0
    }
}
bool_impls! {
    bool64ix2: bool64i, i64x2, i64, 2, bool64ix2_all, bool64ix2_any, x0 | x1
    [/// Convert `self` to a boolean vector for interacting with floating point vectors.
     to_f -> bool64fx2];

    bool64fx2: bool64f, i64x2, i64, 2, bool64fx2_all, bool64fx2_any, x0 | x1
    [/// Convert `self` to a boolean vector for interacting with integer vectors.
     to_i -> bool64ix2];
}

impl u64x2 {
|
||||
/// Convert each lane to a signed integer.
|
||||
#[inline]
|
||||
pub fn to_i64(self) -> i64x2 {
|
||||
unsafe {simd_cast(self)}
|
||||
}
|
||||
/// Convert each lane to a 64-bit float.
|
||||
#[inline]
|
||||
pub fn to_f64(self) -> f64x2 {
|
||||
unsafe {simd_cast(self)}
|
||||
}
|
||||
}
|
||||
impl i64x2 {
|
||||
/// Convert each lane to an unsigned integer.
|
||||
#[inline]
|
||||
pub fn to_u64(self) -> u64x2 {
|
||||
unsafe {simd_cast(self)}
|
||||
}
|
||||
/// Convert each lane to a 64-bit float.
|
||||
#[inline]
|
||||
pub fn to_f64(self) -> f64x2 {
|
||||
unsafe {simd_cast(self)}
|
||||
}
|
||||
}
|
||||
impl f64x2 {
|
||||
/// Convert each lane to a signed integer.
|
||||
#[inline]
|
||||
pub fn to_i64(self) -> i64x2 {
|
||||
unsafe {simd_cast(self)}
|
||||
}
|
||||
/// Convert each lane to an unsigned integer.
|
||||
#[inline]
|
||||
pub fn to_u64(self) -> u64x2 {
|
||||
unsafe {simd_cast(self)}
|
||||
}
|
||||
|
||||
/// Convert each lane to a 32-bit float.
|
||||
#[inline]
|
||||
pub fn to_f32(self) -> f32x4 {
|
||||
unsafe {
|
||||
let x: f32x2 = simd_cast(self);
|
||||
f32x4::new(x.0, x.1, 0.0, 0.0)
|
||||
}
|
||||
}
|
||||
}
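// Illustrative sketch (exposition only): `to_f32` narrows the two f64 lanes
// into the low half of an `f32x4` and zero-fills the upper lanes, since the
// crate exposes no standalone two-lane f32 vector in its public API:
//
//     let wide = f64x2::new(1.5, -2.5);
//     let narrow = wide.to_f32();   // (1.5, -2.5, 0.0, 0.0)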
neg_impls! {
    0,
    i64x2,
}
neg_impls! {
    0.0,
    f64x2,
}
macro_rules! not_impls {
    ($($ty: ident,)*) => {
        $(impl ops::Not for $ty {
            type Output = Self;
            fn not(self) -> Self {
                $ty::splat(!0) ^ self
            }
        })*
    }
}
not_impls! {
    i64x2,
    u64x2,
}
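// Illustrative sketch (exposition only): there is no dedicated vector "not"
// operation here, so `!v` is spelled as XOR against an all-ones splat, which
// flips every bit of every lane:
//
//     let v = u64x2::splat(0x00ff00ff00ff00ff);
//     assert!((!v).eq(u64x2::splat(0xff00ff00ff00ff00)).all());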
macro_rules! operators {
    ($($trayt: ident ($func: ident, $method: ident): $($ty: ty),*;)*) => {
        $(
            $(impl ops::$trayt for $ty {
                type Output = Self;
                #[inline]
                fn $method(self, x: Self) -> Self {
                    unsafe { $func(self, x) }
                }
            })*
        )*
    }
}
operators! {
    Add (simd_add, add):
        i64x2, u64x2,
        f64x2;
    Sub (simd_sub, sub):
        i64x2, u64x2,
        f64x2;
    Mul (simd_mul, mul):
        i64x2, u64x2,
        f64x2;
    Div (simd_div, div): f64x2;

    BitAnd (simd_and, bitand):
        i64x2, u64x2,
        bool64ix2,
        bool64fx2;
    BitOr (simd_or, bitor):
        i64x2, u64x2,
        bool64ix2,
        bool64fx2;
    BitXor (simd_xor, bitxor):
        i64x2, u64x2,
        bool64ix2,
        bool64fx2;
}
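// Illustrative sketch (exposition only): the `operators!` expansion above is
// what makes the ordinary arithmetic and bitwise operators lane-wise on
// these vector types:
//
//     let a = f64x2::new(1.0, 2.0);
//     let b = f64x2::splat(10.0);
//     let c = a * b + a;            // (11.0, 22.0), computed per lane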
macro_rules! shift_one {
    ($ty: ident, $($by: ident),*) => {
        $(
            impl ops::Shl<$by> for $ty {
                type Output = Self;
                #[inline]
                fn shl(self, other: $by) -> Self {
                    unsafe { simd_shl(self, $ty::splat(other as <$ty as Simd>::Elem)) }
                }
            }
            impl ops::Shr<$by> for $ty {
                type Output = Self;
                #[inline]
                fn shr(self, other: $by) -> Self {
                    unsafe { simd_shr(self, $ty::splat(other as <$ty as Simd>::Elem)) }
                }
            }
        )*
    }
}

macro_rules! shift {
    ($($ty: ident),*) => {
        $(shift_one! {
            $ty,
            u8, u16, u32, u64, usize,
            i8, i16, i32, i64, isize
        })*
    }
}
shift! {
    i64x2, u64x2
}
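// Illustrative sketch (exposition only): `shift!` implements `<<` and `>>`
// for every primitive integer shift-amount type, splatting the amount across
// the lanes:
//
//     let v = u64x2::new(1, 2);
//     let s = v << 4u32;            // (16, 32); the same shift per lane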
@@ -0,0 +1,424 @@
#![allow(dead_code)]
use std::ops;
use std::mem;
#[allow(unused_imports)]
use super::{
    Simd,
    u32x4, i32x4, u16x8, i16x8, u8x16, i8x16, f32x4,
    bool32ix4, bool16ix8, bool8ix16, bool32fx4,
    simd_eq, simd_ne, simd_lt, simd_le, simd_gt, simd_ge,
    simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
    simd_insert, simd_extract,
    simd_cast,
    simd_add, simd_sub, simd_mul, simd_div, simd_shl, simd_shr, simd_and, simd_or, simd_xor,
    bool8i, bool16i, bool32i, bool32f,
    Unalign, bitcast,
};
use super::sixty_four::*;
#[cfg(all(target_feature = "avx"))]
use super::x86::avx::common;

#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u64x4(u64, u64, u64, u64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i64x4(i64, i64, i64, i64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct f64x4(f64, f64, f64, f64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool64ix4(i64, i64, i64, i64);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool64fx4(i64, i64, i64, i64);

#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u32x8(u32, u32, u32, u32,
                 u32, u32, u32, u32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i32x8(i32, i32, i32, i32,
                 i32, i32, i32, i32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct f32x8(f32, f32, f32, f32,
                 f32, f32, f32, f32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool32ix8(i32, i32, i32, i32,
                     i32, i32, i32, i32);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool32fx8(i32, i32, i32, i32,
                     i32, i32, i32, i32);

#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u16x16(u16, u16, u16, u16, u16, u16, u16, u16,
                  u16, u16, u16, u16, u16, u16, u16, u16);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i16x16(i16, i16, i16, i16, i16, i16, i16, i16,
                  i16, i16, i16, i16, i16, i16, i16, i16);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool16ix16(i16, i16, i16, i16, i16, i16, i16, i16,
                      i16, i16, i16, i16, i16, i16, i16, i16);

#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct u8x32(u8, u8, u8, u8, u8, u8, u8, u8,
                 u8, u8, u8, u8, u8, u8, u8, u8,
                 u8, u8, u8, u8, u8, u8, u8, u8,
                 u8, u8, u8, u8, u8, u8, u8, u8);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct i8x32(i8, i8, i8, i8, i8, i8, i8, i8,
                 i8, i8, i8, i8, i8, i8, i8, i8,
                 i8, i8, i8, i8, i8, i8, i8, i8,
                 i8, i8, i8, i8, i8, i8, i8, i8);
#[repr(simd)]
#[cfg_attr(feature = "with-serde", derive(Serialize, Deserialize))]
#[derive(Debug, Copy)]
pub struct bool8ix32(i8, i8, i8, i8, i8, i8, i8, i8,
                     i8, i8, i8, i8, i8, i8, i8, i8,
                     i8, i8, i8, i8, i8, i8, i8, i8,
                     i8, i8, i8, i8, i8, i8, i8, i8);

simd! {
    bool8ix32: i8x32 = i8, u8x32 = u8, bool8ix32 = bool8i;
    bool16ix16: i16x16 = i16, u16x16 = u16, bool16ix16 = bool16i;
    bool32ix8: i32x8 = i32, u32x8 = u32, bool32ix8 = bool32i;
    bool64ix4: i64x4 = i64, u64x4 = u64, bool64ix4 = bool64i;

    bool32fx8: f32x8 = f32, bool32fx8 = bool32f;
    bool64fx4: f64x4 = f64, bool64fx4 = bool64f;
}

basic_impls! {
    u64x4: u64, bool64ix4, simd_shuffle4, 4, x0, x1 | x2, x3;
    i64x4: i64, bool64ix4, simd_shuffle4, 4, x0, x1 | x2, x3;
    f64x4: f64, bool64fx4, simd_shuffle4, 4, x0, x1 | x2, x3;

    u32x8: u32, bool32ix8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;
    i32x8: i32, bool32ix8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;
    f32x8: f32, bool32fx8, simd_shuffle8, 8, x0, x1, x2, x3 | x4, x5, x6, x7;

    u16x16: u16, bool16ix16, simd_shuffle16, 16, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15;
    i16x16: i16, bool16ix16, simd_shuffle16, 16, x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15;

    u8x32: u8, bool8ix32, simd_shuffle32, 32, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |
           x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31;
    i8x32: i8, bool8ix32, simd_shuffle32, 32, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |
           x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31;
}

#[cfg(all(not(target_feature = "avx")))]
#[doc(hidden)]
mod common {
    use super::*;
    // implementation via SSE vectors
    macro_rules! bools {
        ($($ty: ty, $all: ident, $any: ident;)*) => {
            $(
                #[inline]
                pub fn $all(x: $ty) -> bool {
                    x.low().all() && x.high().all()
                }
                #[inline]
                pub fn $any(x: $ty) -> bool {
                    x.low().any() || x.high().any()
                }
            )*
        }
    }

    bools! {
        bool64ix4, bool64ix4_all, bool64ix4_any;
        bool64fx4, bool64fx4_all, bool64fx4_any;
        bool32ix8, bool32ix8_all, bool32ix8_any;
        bool32fx8, bool32fx8_all, bool32fx8_any;
        bool16ix16, bool16ix16_all, bool16ix16_any;
        bool8ix32, bool8ix32_all, bool8ix32_any;
    }
}
bool_impls! {
    bool64ix4: bool64i, i64x4, i64, 4, bool64ix4_all, bool64ix4_any, x0, x1 | x2, x3
    [/// Convert `self` to a boolean vector for interacting with floating point vectors.
     to_f -> bool64fx4];

    bool64fx4: bool64f, i64x4, i64, 4, bool64fx4_all, bool64fx4_any, x0, x1 | x2, x3
    [/// Convert `self` to a boolean vector for interacting with integer vectors.
     to_i -> bool64ix4];

    bool32ix8: bool32i, i32x8, i32, 8, bool32ix8_all, bool32ix8_any, x0, x1, x2, x3 | x4, x5, x6, x7
    [/// Convert `self` to a boolean vector for interacting with floating point vectors.
     to_f -> bool32fx8];

    bool32fx8: bool32f, i32x8, i32, 8, bool32fx8_all, bool32fx8_any, x0, x1, x2, x3 | x4, x5, x6, x7
    [/// Convert `self` to a boolean vector for interacting with integer vectors.
     to_i -> bool32ix8];

    bool16ix16: bool16i, i16x16, i16, 16, bool16ix16_all, bool16ix16_any,
                x0, x1, x2, x3, x4, x5, x6, x7 | x8, x9, x10, x11, x12, x13, x14, x15 [];

    bool8ix32: bool8i, i8x32, i8, 32, bool8ix32_all, bool8ix32_any,
               x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |
               x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 [];
}

pub trait LowHigh128 {
    type Half: Simd;
    /// Extract the low 128 bit part.
    fn low(self) -> Self::Half;
    /// Extract the high 128 bit part.
    fn high(self) -> Self::Half;
}

macro_rules! expr { ($x:expr) => ($x) } // HACK
macro_rules! low_high_impls {
    ($(
        $name: ident, $half: ident, $($first: tt),+ ... $($last: tt),+;
    )*) => {
        $(impl LowHigh128 for $name {
            type Half = $half;
            #[inline]
            fn low(self) -> Self::Half {
                $half::new($( expr!(self.$first), )*)
            }

            #[inline]
            fn high(self) -> Self::Half {
                $half::new($( expr!(self.$last), )*)
            }
        })*
    }
}

low_high_impls! {
    u64x4, u64x2, 0, 1 ... 2, 3;
    i64x4, i64x2, 0, 1 ... 2, 3;
    f64x4, f64x2, 0, 1 ... 2, 3;

    u32x8, u32x4, 0, 1, 2, 3 ... 4, 5, 6, 7;
    i32x8, i32x4, 0, 1, 2, 3 ... 4, 5, 6, 7;
    f32x8, f32x4, 0, 1, 2, 3 ... 4, 5, 6, 7;

    u16x16, u16x8, 0, 1, 2, 3, 4, 5, 6, 7 ... 8, 9, 10, 11, 12, 13, 14, 15;
    i16x16, i16x8, 0, 1, 2, 3, 4, 5, 6, 7 ... 8, 9, 10, 11, 12, 13, 14, 15;

    u8x32, u8x16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ...
                  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31;
    i8x32, i8x16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ...
                  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31;
}

macro_rules! bool_low_high_impls {
    ($(
        $name: ident: $half: ident;
    )*) => {
        $(impl LowHigh128 for $name {
            type Half = $half;
            /// Extract the low 128 bit part.
            #[inline]
            fn low(self) -> Self::Half {
                Self::Half::from_repr(self.to_repr().low())
            }

            /// Extract the high 128 bit part.
            #[inline]
            fn high(self) -> Self::Half {
                Self::Half::from_repr(self.to_repr().high())
            }
        })*
    }
}

bool_low_high_impls! {
    bool64fx4: bool64fx2;
    bool32fx8: bool32fx4;

    bool64ix4: bool64ix2;
    bool32ix8: bool32ix4;
    bool16ix16: bool16ix8;
    bool8ix32: bool8ix16;
}
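// Illustrative sketch (exposition only): without AVX, 256-bit values are
// handled as two 128-bit halves via `LowHigh128`, which is exactly how the
// non-AVX `common` module above reduces `all`/`any`:
//
//     let v = u64x4::new(1, 2, 3, 4);
//     let lo = v.low();    // u64x2: (1, 2)
//     let hi = v.high();   // u64x2: (3, 4)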
impl u64x4 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i64(self) -> i64x4 {
        unsafe { simd_cast(self) }
    }
    /// Convert each lane to a 64-bit float.
    #[inline]
    pub fn to_f64(self) -> f64x4 {
        unsafe { simd_cast(self) }
    }
}

impl i64x4 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u64(self) -> u64x4 {
        unsafe { simd_cast(self) }
    }
    /// Convert each lane to a 64-bit float.
    #[inline]
    pub fn to_f64(self) -> f64x4 {
        unsafe { simd_cast(self) }
    }
}

impl f64x4 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i64(self) -> i64x4 {
        unsafe { simd_cast(self) }
    }
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u64(self) -> u64x4 {
        unsafe { simd_cast(self) }
    }
}

impl u32x8 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i32(self) -> i32x8 {
        unsafe { simd_cast(self) }
    }
    /// Convert each lane to a 32-bit float.
    #[inline]
    pub fn to_f32(self) -> f32x8 {
        unsafe { simd_cast(self) }
    }
}

impl i32x8 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u32(self) -> u32x8 {
        unsafe { simd_cast(self) }
    }
    /// Convert each lane to a 32-bit float.
    #[inline]
    pub fn to_f32(self) -> f32x8 {
        unsafe { simd_cast(self) }
    }
}

impl i16x16 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u16(self) -> u16x16 {
        unsafe { simd_cast(self) }
    }
}

impl u16x16 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i16(self) -> i16x16 {
        unsafe { simd_cast(self) }
    }
}

impl i8x32 {
    /// Convert each lane to an unsigned integer.
    #[inline]
    pub fn to_u8(self) -> u8x32 {
        unsafe { simd_cast(self) }
    }
}

impl u8x32 {
    /// Convert each lane to a signed integer.
    #[inline]
    pub fn to_i8(self) -> i8x32 {
        unsafe { simd_cast(self) }
    }
}

operators! {
    Add (simd_add, add):
        i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
        f64x4, f32x8;
    Sub (simd_sub, sub):
        i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
        f64x4, f32x8;
    Mul (simd_mul, mul):
        i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
        f64x4, f32x8;
    Div (simd_div, div): f64x4, f32x8;

    BitAnd (simd_and, bitand):
        i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
        bool64ix4, bool32ix8, bool16ix16,
        bool64fx4, bool32fx8;
    BitOr (simd_or, bitor):
        i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
        bool64ix4, bool32ix8, bool16ix16,
        bool64fx4, bool32fx8;
    BitXor (simd_xor, bitxor):
        i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, i64x4, u64x4,
        bool64ix4, bool32ix8, bool16ix16,
        bool64fx4, bool32fx8;
}

neg_impls! {
    0,
    i64x4,
    i32x8,
    i16x16,
    i8x32,
}

neg_impls! {
    0.0,
    f64x4,
    f32x8,
}

not_impls! {
    i64x4,
    u64x4,
    i32x8,
    u32x8,
    i16x16,
    u16x16,
    i8x32,
    u8x32,
}

shift! {
    i64x4,
    u64x4,
    i32x8,
    u32x8,
    i16x16,
    u16x16,
    i8x32,
    u8x32
}
@@ -0,0 +1,290 @@
use super::super::*;
use sixty_four::*;

use super::super::bitcast;

pub use v256::{
    f64x4, bool64fx4, u64x4, i64x4, bool64ix4,
    f32x8, bool32fx8, u32x8, i32x8, bool32ix8,
    u16x16, i16x16, bool16ix16,
    u8x32, i8x32, bool8ix32,
    LowHigh128
};

#[allow(dead_code)]
extern "platform-intrinsic" {
    fn x86_mm256_addsub_ps(x: f32x8, y: f32x8) -> f32x8;
    fn x86_mm256_addsub_pd(x: f64x4, y: f64x4) -> f64x4;
    fn x86_mm256_dp_ps(x: f32x8, y: f32x8, z: i32) -> f32x8;
    fn x86_mm256_hadd_ps(x: f32x8, y: f32x8) -> f32x8;
    fn x86_mm256_hadd_pd(x: f64x4, y: f64x4) -> f64x4;
    fn x86_mm256_hsub_ps(x: f32x8, y: f32x8) -> f32x8;
    fn x86_mm256_hsub_pd(x: f64x4, y: f64x4) -> f64x4;
    fn x86_mm256_max_ps(x: f32x8, y: f32x8) -> f32x8;
    fn x86_mm256_max_pd(x: f64x4, y: f64x4) -> f64x4;
    fn x86_mm256_min_ps(x: f32x8, y: f32x8) -> f32x8;
    fn x86_mm256_min_pd(x: f64x4, y: f64x4) -> f64x4;
    fn x86_mm256_movemask_ps(x: f32x8) -> i32;
    fn x86_mm256_movemask_pd(x: f64x4) -> i32;
    fn x86_mm_permutevar_ps(x: f32x4, y: i32x4) -> f32x4;
    fn x86_mm_permutevar_pd(x: f64x2, y: i64x2) -> f64x2;
    fn x86_mm256_permutevar_ps(x: f32x8, y: i32x8) -> f32x8;
    fn x86_mm256_permutevar_pd(x: f64x4, y: i64x4) -> f64x4;
    fn x86_mm256_rcp_ps(x: f32x8) -> f32x8;
    fn x86_mm256_rsqrt_ps(x: f32x8) -> f32x8;
    fn x86_mm256_sqrt_ps(x: f32x8) -> f32x8;
    fn x86_mm256_sqrt_pd(x: f64x4) -> f64x4;
    fn x86_mm_testc_ps(x: f32x4, y: f32x4) -> i32;
    fn x86_mm256_testc_ps(x: f32x8, y: f32x8) -> i32;
    fn x86_mm_testc_pd(x: f64x2, y: f64x2) -> i32;
    fn x86_mm256_testc_pd(x: f64x4, y: f64x4) -> i32;
    fn x86_mm256_testc_si256(x: u64x4, y: u64x4) -> i32;
    fn x86_mm_testnzc_ps(x: f32x4, y: f32x4) -> i32;
    fn x86_mm256_testnzc_ps(x: f32x8, y: f32x8) -> i32;
    fn x86_mm_testnzc_pd(x: f64x2, y: f64x2) -> i32;
    fn x86_mm256_testnzc_pd(x: f64x4, y: f64x4) -> i32;
    fn x86_mm256_testnzc_si256(x: u64x4, y: u64x4) -> i32;
    fn x86_mm_testz_ps(x: f32x4, y: f32x4) -> i32;
    fn x86_mm256_testz_ps(x: f32x8, y: f32x8) -> i32;
    fn x86_mm_testz_pd(x: f64x2, y: f64x2) -> i32;
    fn x86_mm256_testz_pd(x: f64x4, y: f64x4) -> i32;
    fn x86_mm256_testz_si256(x: u64x4, y: u64x4) -> i32;
}

#[doc(hidden)]
pub mod common {
    use super::*;
    use std::mem;

    macro_rules! bools {
        ($($ty: ty, $all: ident, $any: ident, $testc: ident, $testz: ident;)*) => {
            $(
                #[inline]
                pub fn $all(x: $ty) -> bool {
                    unsafe {
                        super::$testc(mem::transmute(x), mem::transmute(<$ty>::splat(true))) != 0
                    }
                }
                #[inline]
                pub fn $any(x: $ty) -> bool {
                    unsafe {
                        super::$testz(mem::transmute(x), mem::transmute(x)) == 0
                    }
                }
            )*
        }
    }

    bools! {
        bool32fx8, bool32fx8_all, bool32fx8_any, x86_mm256_testc_ps, x86_mm256_testz_ps;
        bool64fx4, bool64fx4_all, bool64fx4_any, x86_mm256_testc_pd, x86_mm256_testz_pd;
        bool8ix32, bool8ix32_all, bool8ix32_any, x86_mm256_testc_si256, x86_mm256_testz_si256;
        bool16ix16, bool16ix16_all, bool16ix16_any, x86_mm256_testc_si256, x86_mm256_testz_si256;
        bool32ix8, bool32ix8_all, bool32ix8_any, x86_mm256_testc_si256, x86_mm256_testz_si256;
        bool64ix4, bool64ix4_all, bool64ix4_any, x86_mm256_testc_si256, x86_mm256_testz_si256;
    }
}
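// Illustrative sketch (exposition only) of why the reductions above work:
// VTEST-style intrinsics compute bitwise predicates, so with `ones` meaning
// an all-true mask,
//
//     testc(x, ones) != 0  <=>  (!x & ones) == 0  <=>  every bit of x is set,
//     testz(x, x)    == 0  <=>  (x & x) != 0      <=>  some bit of x is set,
//
// which match `all` and `any` respectively; a scalar model would be:
//
//     fn testc(x: u64, y: u64) -> bool { !x & y == 0 }
//     fn testz(x: u64, y: u64) -> bool { x & y == 0 }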
// 128-bit vectors:

// 32 bit floats

pub trait AvxF32x4 {
    fn permutevar(self, other: i32x4) -> f32x4;
}
impl AvxF32x4 for f32x4 {
    fn permutevar(self, other: i32x4) -> f32x4 {
        unsafe { x86_mm_permutevar_ps(self, other) }
    }
}

pub trait AvxF64x4 {
    fn sqrt(self) -> Self;
    fn addsub(self, other: Self) -> Self;
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
    fn move_mask(self) -> u32;
}

impl AvxF64x4 for f64x4 {
    #[inline]
    fn sqrt(self) -> Self {
        unsafe { x86_mm256_sqrt_pd(self) }
    }

    #[inline]
    fn addsub(self, other: Self) -> Self {
        unsafe { x86_mm256_addsub_pd(self, other) }
    }

    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { x86_mm256_hadd_pd(self, other) }
    }

    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { x86_mm256_hsub_pd(self, other) }
    }

    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm256_max_pd(self, other) }
    }

    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm256_min_pd(self, other) }
    }

    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm256_movemask_pd(self) as u32 }
    }
}

pub trait AvxBool64fx4 {
    fn move_mask(self) -> u32;
}
impl AvxBool64fx4 for bool64fx4 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm256_movemask_pd(bitcast(self)) as u32 }
    }
}

pub trait AvxF32x8 {
    fn sqrt(self) -> Self;
    fn addsub(self, other: Self) -> Self;
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
    fn move_mask(self) -> u32;
    /// Compute an approximation to the reciprocal of the square root
    /// of `self`, that is, `f32x8::splat(1.0) / self.sqrt()`.
    ///
    /// The accuracy of this approximation is platform dependent.
    fn approx_rsqrt(self) -> Self;
    /// Compute an approximation to the reciprocal of `self`, that is,
    /// `f32x8::splat(1.0) / self`.
    ///
    /// The accuracy of this approximation is platform dependent.
    fn approx_reciprocal(self) -> Self;
}

impl AvxF32x8 for f32x8 {
    #[inline]
    fn sqrt(self) -> Self {
        unsafe { x86_mm256_sqrt_ps(self) }
    }

    #[inline]
    fn addsub(self, other: Self) -> Self {
        unsafe { x86_mm256_addsub_ps(self, other) }
    }

    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { x86_mm256_hadd_ps(self, other) }
    }

    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { x86_mm256_hsub_ps(self, other) }
    }

    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm256_max_ps(self, other) }
    }

    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm256_min_ps(self, other) }
    }

    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm256_movemask_ps(self) as u32 }
    }

    #[inline]
    fn approx_reciprocal(self) -> Self {
        unsafe { x86_mm256_rcp_ps(self) }
    }

    #[inline]
    fn approx_rsqrt(self) -> Self {
        unsafe { x86_mm256_rsqrt_ps(self) }
    }
}
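// Illustrative sketch (exposition only): because `approx_rsqrt` is only a
// rough estimate (about 12 bits of precision on typical x86 parts), callers
// that need more accuracy usually add one Newton-Raphson step:
//
//     fn rsqrt_refined(x: f32x8) -> f32x8 {
//         let y = x.approx_rsqrt();
//         // y' = y * (1.5 - 0.5 * x * y * y)
//         y * (f32x8::splat(1.5) - f32x8::splat(0.5) * x * y * y)
//     }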
pub trait AvxBool32fx8 {
    fn move_mask(self) -> u32;
}
impl AvxBool32fx8 for bool32fx8 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm256_movemask_ps(bitcast(self)) as u32 }
    }
}

pub trait AvxBool32fx4 {}
impl AvxBool32fx4 for bool32fx4 {}

// 64 bit floats

pub trait AvxF64x2 {
    fn permutevar(self, other: i64x2) -> f64x2;
}
impl AvxF64x2 for f64x2 {
    fn permutevar(self, other: i64x2) -> f64x2 {
        unsafe { x86_mm_permutevar_pd(self, other) }
    }
}

pub trait AvxBool64fx2 {}
impl AvxBool64fx2 for bool64fx2 {}

// 64 bit integers

pub trait AvxU64x2 {}
impl AvxU64x2 for u64x2 {}
pub trait AvxI64x2 {}
impl AvxI64x2 for i64x2 {}

pub trait AvxBool64ix2 {}
impl AvxBool64ix2 for bool64ix2 {}

// 32 bit integers

pub trait AvxU32x4 {}
impl AvxU32x4 for u32x4 {}
pub trait AvxI32x4 {}
impl AvxI32x4 for i32x4 {}

pub trait AvxBool32ix4 {}
impl AvxBool32ix4 for bool32ix4 {}

// 16 bit integers

pub trait AvxU16x8 {}
impl AvxU16x8 for u16x8 {}
pub trait AvxI16x8 {}
impl AvxI16x8 for i16x8 {}

pub trait AvxBool16ix8 {}
impl AvxBool16ix8 for bool16ix8 {}

// 8 bit integers

pub trait AvxU8x16 {}
impl AvxU8x16 for u8x16 {}
pub trait AvxI8x16 {}
impl AvxI8x16 for i8x16 {}

pub trait AvxBool8ix16 {}
impl AvxBool8ix16 for bool8ix16 {}
@@ -0,0 +1,65 @@
use x86::avx::*;

#[allow(dead_code)]
extern "platform-intrinsic" {
    fn x86_mm256_abs_epi8(x: i8x32) -> i8x32;
    fn x86_mm256_abs_epi16(x: i16x16) -> i16x16;
    fn x86_mm256_abs_epi32(x: i32x8) -> i32x8;
    fn x86_mm256_adds_epi8(x: i8x32, y: i8x32) -> i8x32;
    fn x86_mm256_adds_epu8(x: u8x32, y: u8x32) -> u8x32;
    fn x86_mm256_adds_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_adds_epu16(x: u16x16, y: u16x16) -> u16x16;
    fn x86_mm256_avg_epu8(x: u8x32, y: u8x32) -> u8x32;
    fn x86_mm256_avg_epu16(x: u16x16, y: u16x16) -> u16x16;
    fn x86_mm256_hadd_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_hadd_epi32(x: i32x8, y: i32x8) -> i32x8;
    fn x86_mm256_hadds_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_hsub_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_hsub_epi32(x: i32x8, y: i32x8) -> i32x8;
    fn x86_mm256_hsubs_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_madd_epi16(x: i16x16, y: i16x16) -> i32x8;
    fn x86_mm256_maddubs_epi16(x: i8x32, y: i8x32) -> i16x16;
    fn x86_mm256_max_epi8(x: i8x32, y: i8x32) -> i8x32;
    fn x86_mm256_max_epu8(x: u8x32, y: u8x32) -> u8x32;
    fn x86_mm256_max_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_max_epu16(x: u16x16, y: u16x16) -> u16x16;
    fn x86_mm256_max_epi32(x: i32x8, y: i32x8) -> i32x8;
    fn x86_mm256_max_epu32(x: u32x8, y: u32x8) -> u32x8;
    fn x86_mm256_min_epi8(x: i8x32, y: i8x32) -> i8x32;
    fn x86_mm256_min_epu8(x: u8x32, y: u8x32) -> u8x32;
    fn x86_mm256_min_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_min_epu16(x: u16x16, y: u16x16) -> u16x16;
    fn x86_mm256_min_epi32(x: i32x8, y: i32x8) -> i32x8;
    fn x86_mm256_min_epu32(x: u32x8, y: u32x8) -> u32x8;
    fn x86_mm256_mul_epi64(x: i32x8, y: i32x8) -> i64x4;
    fn x86_mm256_mul_epu64(x: u32x8, y: u32x8) -> u64x4;
    fn x86_mm256_mulhi_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_mulhi_epu16(x: u16x16, y: u16x16) -> u16x16;
    fn x86_mm256_mulhrs_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_packs_epi16(x: i16x16, y: i16x16) -> i8x32;
    fn x86_mm256_packus_epi16(x: i16x16, y: i16x16) -> u8x32;
    fn x86_mm256_packs_epi32(x: i32x8, y: i32x8) -> i16x16;
    fn x86_mm256_packus_epi32(x: i32x8, y: i32x8) -> u16x16;
    fn x86_mm256_permutevar8x32_epi32(x: i32x8, y: i32x8) -> i32x8;
    fn x86_mm256_permutevar8x32_ps(x: f32x8, y: i32x8) -> f32x8;
    fn x86_mm256_sad_epu8(x: u8x32, y: u8x32) -> u8x32;
    fn x86_mm256_shuffle_epi8(x: i8x32, y: i8x32) -> i8x32;
    fn x86_mm256_sign_epi8(x: i8x32, y: i8x32) -> i8x32;
    fn x86_mm256_sign_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_sign_epi32(x: i32x8, y: i32x8) -> i32x8;
    fn x86_mm256_subs_epi8(x: i8x32, y: i8x32) -> i8x32;
    fn x86_mm256_subs_epu8(x: u8x32, y: u8x32) -> u8x32;
    fn x86_mm256_subs_epi16(x: i16x16, y: i16x16) -> i16x16;
    fn x86_mm256_subs_epu16(x: u16x16, y: u16x16) -> u16x16;
}

// broken on rustc 1.7.0-nightly (1ddaf8bdf 2015-12-12)
// pub trait Avx2F32x8 {
//     fn permutevar(self, other: i32x8) -> f32x8;
// }
//
// impl Avx2F32x8 for f32x8 {
//     fn permutevar(self, other: i32x8) -> f32x8 {
//         unsafe { x86_mm256_permutevar8x32_ps(self, other) }
//     }
// }
@@ -0,0 +1,16 @@
//! Features specific to x86 and x86-64 CPUs.

#[cfg(any(feature = "doc", target_feature = "sse2"))]
pub mod sse2;
#[cfg(any(feature = "doc", target_feature = "sse3"))]
pub mod sse3;
#[cfg(any(feature = "doc", target_feature = "ssse3"))]
pub mod ssse3;
#[cfg(any(feature = "doc", target_feature = "sse4.1"))]
pub mod sse4_1;
#[cfg(any(feature = "doc", target_feature = "sse4.2"))]
pub mod sse4_2;
#[cfg(any(feature = "doc", target_feature = "avx"))]
pub mod avx;
#[cfg(any(feature = "doc", target_feature = "avx2"))]
pub mod avx2;
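// Illustrative sketch (exposition only): each submodule above only exists
// when the matching `target_feature` is enabled at compile time (e.g. with
// `RUSTFLAGS="-C target-feature=+avx"`), so consumers gate their own code
// the same way rather than dispatching at runtime:
//
//     #[cfg(target_feature = "avx")]
//     fn pairwise_sums(v: simd::x86::avx::f32x8) -> simd::x86::avx::f32x8 {
//         use simd::x86::avx::AvxF32x8;
//         v.hadd(v)
//     }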
@@ -0,0 +1,359 @@
use super::super::*;
use {bitcast, simd_cast, f32x2};

pub use sixty_four::{f64x2, i64x2, u64x2, bool64ix2, bool64fx2};

//pub use super::{u64x2, i64x2, f64x2, bool64ix2, bool64fx2};

// strictly speaking, these are SSE instructions, not SSE2.
extern "platform-intrinsic" {
    fn x86_mm_movemask_ps(x: f32x4) -> i32;
    fn x86_mm_max_ps(x: f32x4, y: f32x4) -> f32x4;
    fn x86_mm_min_ps(x: f32x4, y: f32x4) -> f32x4;
    fn x86_mm_rsqrt_ps(x: f32x4) -> f32x4;
    fn x86_mm_rcp_ps(x: f32x4) -> f32x4;
    fn x86_mm_sqrt_ps(x: f32x4) -> f32x4;
}

extern "platform-intrinsic" {
    fn x86_mm_adds_epi8(x: i8x16, y: i8x16) -> i8x16;
    fn x86_mm_adds_epu8(x: u8x16, y: u8x16) -> u8x16;
    fn x86_mm_adds_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_adds_epu16(x: u16x8, y: u16x8) -> u16x8;
    fn x86_mm_avg_epu8(x: u8x16, y: u8x16) -> u8x16;
    fn x86_mm_avg_epu16(x: u16x8, y: u16x8) -> u16x8;
    fn x86_mm_madd_epi16(x: i16x8, y: i16x8) -> i32x4;
    fn x86_mm_max_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_max_epu8(x: u8x16, y: u8x16) -> u8x16;
    fn x86_mm_max_pd(x: f64x2, y: f64x2) -> f64x2;
    fn x86_mm_min_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_min_epu8(x: u8x16, y: u8x16) -> u8x16;
    fn x86_mm_min_pd(x: f64x2, y: f64x2) -> f64x2;
    fn x86_mm_movemask_pd(x: f64x2) -> i32;
    fn x86_mm_movemask_epi8(x: i8x16) -> i32;
    fn x86_mm_mul_epu32(x: u32x4, y: u32x4) -> u64x2;
    fn x86_mm_mulhi_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_mulhi_epu16(x: u16x8, y: u16x8) -> u16x8;
    fn x86_mm_packs_epi16(x: i16x8, y: i16x8) -> i8x16;
    fn x86_mm_packs_epi32(x: i32x4, y: i32x4) -> i16x8;
    fn x86_mm_packus_epi16(x: i16x8, y: i16x8) -> u8x16;
    fn x86_mm_sad_epu8(x: u8x16, y: u8x16) -> u64x2;
    fn x86_mm_sqrt_pd(x: f64x2) -> f64x2;
    fn x86_mm_subs_epi8(x: i8x16, y: i8x16) -> i8x16;
    fn x86_mm_subs_epu8(x: u8x16, y: u8x16) -> u8x16;
    fn x86_mm_subs_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_subs_epu16(x: u16x8, y: u16x8) -> u16x8;
}

#[doc(hidden)]
pub mod common {
    use super::super::super::*;
    use std::mem;

    #[inline]
    pub fn f32x4_sqrt(x: f32x4) -> f32x4 {
        unsafe { super::x86_mm_sqrt_ps(x) }
    }
    #[inline]
    pub fn f32x4_approx_rsqrt(x: f32x4) -> f32x4 {
        unsafe { super::x86_mm_rsqrt_ps(x) }
    }
    #[inline]
    pub fn f32x4_approx_reciprocal(x: f32x4) -> f32x4 {
        unsafe { super::x86_mm_rcp_ps(x) }
    }
    #[inline]
    pub fn f32x4_max(x: f32x4, y: f32x4) -> f32x4 {
        unsafe { super::x86_mm_max_ps(x, y) }
    }
    #[inline]
    pub fn f32x4_min(x: f32x4, y: f32x4) -> f32x4 {
        unsafe { super::x86_mm_min_ps(x, y) }
    }

    macro_rules! bools {
        ($($ty: ty, $all: ident, $any: ident, $movemask: ident, $width: expr;)*) => {
            $(
                #[inline]
                pub fn $all(x: $ty) -> bool {
                    unsafe {
                        super::$movemask(mem::transmute(x)) == (1 << $width) - 1
                    }
                }
                #[inline]
                pub fn $any(x: $ty) -> bool {
                    unsafe {
                        super::$movemask(mem::transmute(x)) != 0
                    }
                }
            )*
        }
    }

    bools! {
        bool32fx4, bool32fx4_all, bool32fx4_any, x86_mm_movemask_ps, 4;
        bool8ix16, bool8ix16_all, bool8ix16_any, x86_mm_movemask_epi8, 16;
        bool16ix8, bool16ix8_all, bool16ix8_any, x86_mm_movemask_epi8, 16;
        bool32ix4, bool32ix4_all, bool32ix4_any, x86_mm_movemask_epi8, 16;
    }
}
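// Illustrative sketch (exposition only): `movemask` packs one sign bit per
// lane (per *byte* for `movemask_epi8`) into the low bits of an integer, so
// "all lanes true" is the full mask and "any lane true" is a nonzero mask:
//
//     4 f32 lanes => all == 0b1111  == (1 << 4) - 1
//     16 bytes    => all == 0xffff  == (1 << 16) - 1
//
// The 16- and 32-bit integer booleans can reuse `movemask_epi8` because an
// all-ones lane sets the sign bit of every one of its bytes.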
// 32 bit floats

pub trait Sse2F32x4 {
    fn to_f64(self) -> f64x2;
    fn move_mask(self) -> u32;
}
impl Sse2F32x4 for f32x4 {
    #[inline]
    fn to_f64(self) -> f64x2 {
        unsafe {
            simd_cast(f32x2(self.0, self.1))
        }
    }
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_ps(self) as u32 }
    }
}
pub trait Sse2Bool32fx4 {
    fn move_mask(self) -> u32;
}
impl Sse2Bool32fx4 for bool32fx4 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_ps(bitcast(self)) as u32 }
    }
}

// 64 bit floats

pub trait Sse2F64x2 {
    fn move_mask(self) -> u32;
    fn sqrt(self) -> Self;
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
}
impl Sse2F64x2 for f64x2 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_pd(bitcast(self)) as u32 }
    }

    #[inline]
    fn sqrt(self) -> Self {
        unsafe { x86_mm_sqrt_pd(self) }
    }

    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_pd(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_pd(self, other) }
    }
}

pub trait Sse2Bool64fx2 {
    fn move_mask(self) -> u32;
}
impl Sse2Bool64fx2 for bool64fx2 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_pd(bitcast(self)) as u32 }
    }
}

// 64 bit ints

pub trait Sse2U64x2 {}
impl Sse2U64x2 for u64x2 {}

pub trait Sse2I64x2 {}
impl Sse2I64x2 for i64x2 {}

pub trait Sse2Bool64ix2 {}
impl Sse2Bool64ix2 for bool64ix2 {}

// 32 bit ints

pub trait Sse2U32x4 {
    fn low_mul(self, other: Self) -> u64x2;
}
impl Sse2U32x4 for u32x4 {
    #[inline]
    fn low_mul(self, other: Self) -> u64x2 {
        unsafe { x86_mm_mul_epu32(self, other) }
    }
}

pub trait Sse2I32x4 {
    fn packs(self, other: Self) -> i16x8;
}
impl Sse2I32x4 for i32x4 {
    #[inline]
    fn packs(self, other: Self) -> i16x8 {
        unsafe { x86_mm_packs_epi32(self, other) }
    }
}
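// Illustrative sketch (exposition only): `packs` narrows with *signed
// saturation*, clamping out-of-range lanes rather than truncating them:
//
//     let a = i32x4::new(70000, -70000, 1, -1);
//     let p = a.packs(a);   // i16 lanes: 32767, -32768, 1, -1 (then repeated)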
pub trait Sse2Bool32ix4 {}
impl Sse2Bool32ix4 for bool32ix4 {}

// 16 bit ints

pub trait Sse2U16x8 {
    fn adds(self, other: Self) -> Self;
    fn subs(self, other: Self) -> Self;
    fn avg(self, other: Self) -> Self;
    fn mulhi(self, other: Self) -> Self;
}
impl Sse2U16x8 for u16x8 {
    #[inline]
    fn adds(self, other: Self) -> Self {
        unsafe { x86_mm_adds_epu16(self, other) }
    }
    #[inline]
    fn subs(self, other: Self) -> Self {
        unsafe { x86_mm_subs_epu16(self, other) }
    }

    #[inline]
    fn avg(self, other: Self) -> Self {
        unsafe { x86_mm_avg_epu16(self, other) }
    }

    #[inline]
    fn mulhi(self, other: Self) -> Self {
        unsafe { x86_mm_mulhi_epu16(self, other) }
    }
}

pub trait Sse2I16x8 {
    fn adds(self, other: Self) -> Self;
    fn subs(self, other: Self) -> Self;
    fn madd(self, other: Self) -> i32x4;
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
    fn mulhi(self, other: Self) -> Self;
    fn packs(self, other: Self) -> i8x16;
    fn packus(self, other: Self) -> u8x16;
}
impl Sse2I16x8 for i16x8 {
    #[inline]
    fn adds(self, other: Self) -> Self {
        unsafe { x86_mm_adds_epi16(self, other) }
    }
    #[inline]
    fn subs(self, other: Self) -> Self {
        unsafe { x86_mm_subs_epi16(self, other) }
    }

    #[inline]
    fn madd(self, other: Self) -> i32x4 {
        unsafe { x86_mm_madd_epi16(self, other) }
    }

    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_epi16(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_epi16(self, other) }
    }

    #[inline]
    fn mulhi(self, other: Self) -> Self {
        unsafe { x86_mm_mulhi_epi16(self, other) }
    }

    #[inline]
    fn packs(self, other: Self) -> i8x16 {
        unsafe { x86_mm_packs_epi16(self, other) }
    }
    #[inline]
    fn packus(self, other: Self) -> u8x16 {
        unsafe { x86_mm_packus_epi16(self, other) }
    }
}

pub trait Sse2Bool16ix8 {}
impl Sse2Bool16ix8 for bool16ix8 {}

// 8 bit ints

pub trait Sse2U8x16 {
    fn move_mask(self) -> u32;
    fn adds(self, other: Self) -> Self;
    fn subs(self, other: Self) -> Self;
    fn avg(self, other: Self) -> Self;
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
    fn sad(self, other: Self) -> u64x2;
}
impl Sse2U8x16 for u8x16 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_epi8(bitcast(self)) as u32 }
    }

    #[inline]
    fn adds(self, other: Self) -> Self {
        unsafe { x86_mm_adds_epu8(self, other) }
    }
    #[inline]
    fn subs(self, other: Self) -> Self {
        unsafe { x86_mm_subs_epu8(self, other) }
    }

    #[inline]
    fn avg(self, other: Self) -> Self {
        unsafe { x86_mm_avg_epu8(self, other) }
    }

    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_epu8(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_epu8(self, other) }
    }

    #[inline]
    fn sad(self, other: Self) -> u64x2 {
        unsafe { x86_mm_sad_epu8(self, other) }
    }
}
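// Illustrative sketch (exposition only): `sad` (sum of absolute differences)
// reduces each 8-byte half to one u64 lane, a common building block for
// motion estimation and for summing bytes quickly:
//
//     let a = u8x16::splat(3);
//     let b = u8x16::splat(1);
//     let d = a.sad(b);     // u64x2: (16, 16) -- eight |3 - 1| per half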
pub trait Sse2I8x16 {
    fn move_mask(self) -> u32;
    fn adds(self, other: Self) -> Self;
    fn subs(self, other: Self) -> Self;
}
impl Sse2I8x16 for i8x16 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_epi8(bitcast(self)) as u32 }
    }

    #[inline]
    fn adds(self, other: Self) -> Self {
        unsafe { x86_mm_adds_epi8(self, other) }
    }
    #[inline]
    fn subs(self, other: Self) -> Self {
        unsafe { x86_mm_subs_epi8(self, other) }
    }
}

pub trait Sse2Bool8ix16 {
    fn move_mask(self) -> u32;
}
impl Sse2Bool8ix16 for bool8ix16 {
    #[inline]
    fn move_mask(self) -> u32 {
        unsafe { x86_mm_movemask_epi8(bitcast(self)) as u32 }
    }
}
@@ -0,0 +1,57 @@
use sixty_four::*;
use super::super::*;

extern "platform-intrinsic" {
    fn x86_mm_addsub_ps(x: f32x4, y: f32x4) -> f32x4;
    fn x86_mm_addsub_pd(x: f64x2, y: f64x2) -> f64x2;
    fn x86_mm_hadd_ps(x: f32x4, y: f32x4) -> f32x4;
    fn x86_mm_hadd_pd(x: f64x2, y: f64x2) -> f64x2;
    fn x86_mm_hsub_ps(x: f32x4, y: f32x4) -> f32x4;
    fn x86_mm_hsub_pd(x: f64x2, y: f64x2) -> f64x2;
}

pub trait Sse3F32x4 {
    fn addsub(self, other: Self) -> Self;
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
}

impl Sse3F32x4 for f32x4 {
    #[inline]
    fn addsub(self, other: Self) -> Self {
        unsafe { x86_mm_addsub_ps(self, other) }
    }

    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { x86_mm_hadd_ps(self, other) }
    }

    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { x86_mm_hsub_ps(self, other) }
    }
}
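// Illustrative sketch (exposition only): `hadd` is *horizontal*, pairing
// adjacent lanes within each operand rather than matching lanes across them:
//
//     let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
//     let b = f32x4::new(10.0, 20.0, 30.0, 40.0);
//     let h = a.hadd(b);    // (1+2, 3+4, 10+20, 30+40) = (3, 7, 30, 70)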
pub trait Sse3F64x2 {
    fn addsub(self, other: Self) -> Self;
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
}

impl Sse3F64x2 for f64x2 {
    #[inline]
    fn addsub(self, other: Self) -> Self {
        unsafe { x86_mm_addsub_pd(self, other) }
    }

    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { x86_mm_hadd_pd(self, other) }
    }

    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { x86_mm_hsub_pd(self, other) }
    }
}
@@ -0,0 +1,155 @@
use super::super::*;
use x86::sse2::*;

#[allow(dead_code)]
extern "platform-intrinsic" {
    fn x86_mm_dp_ps(x: f32x4, y: f32x4, z: i32) -> f32x4;
    fn x86_mm_dp_pd(x: f64x2, y: f64x2, z: i32) -> f64x2;
    fn x86_mm_max_epi8(x: i8x16, y: i8x16) -> i8x16;
    fn x86_mm_max_epu16(x: u16x8, y: u16x8) -> u16x8;
    fn x86_mm_max_epi32(x: i32x4, y: i32x4) -> i32x4;
    fn x86_mm_max_epu32(x: u32x4, y: u32x4) -> u32x4;
    fn x86_mm_min_epi8(x: i8x16, y: i8x16) -> i8x16;
    fn x86_mm_min_epu16(x: u16x8, y: u16x8) -> u16x8;
    fn x86_mm_min_epi32(x: i32x4, y: i32x4) -> i32x4;
    fn x86_mm_min_epu32(x: u32x4, y: u32x4) -> u32x4;
    fn x86_mm_minpos_epu16(x: u16x8) -> u16x8;
    fn x86_mm_mpsadbw_epu8(x: u8x16, y: u8x16, z: i32) -> u16x8;
    fn x86_mm_mul_epi32(x: i32x4, y: i32x4) -> i64x2;
    fn x86_mm_packus_epi32(x: i32x4, y: i32x4) -> u16x8;
    fn x86_mm_testc_si128(x: u64x2, y: u64x2) -> i32;
    fn x86_mm_testnzc_si128(x: u64x2, y: u64x2) -> i32;
    fn x86_mm_testz_si128(x: u64x2, y: u64x2) -> i32;
}

// 32 bit floats

pub trait Sse41F32x4 {}
impl Sse41F32x4 for f32x4 {}

// 64 bit floats

pub trait Sse41F64x2 {}
impl Sse41F64x2 for f64x2 {}

// 64 bit integers

pub trait Sse41U64x2 {
    fn testc(self, other: Self) -> i32;
    fn testnzc(self, other: Self) -> i32;
    fn testz(self, other: Self) -> i32;
}
impl Sse41U64x2 for u64x2 {
    #[inline]
    fn testc(self, other: Self) -> i32 {
        unsafe { x86_mm_testc_si128(self, other) }
    }
    #[inline]
    fn testnzc(self, other: Self) -> i32 {
        unsafe { x86_mm_testnzc_si128(self, other) }
    }
    #[inline]
    fn testz(self, other: Self) -> i32 {
        unsafe { x86_mm_testz_si128(self, other) }
    }
}
pub trait Sse41I64x2 {}
impl Sse41I64x2 for i64x2 {}

pub trait Sse41Bool64ix2 {}
impl Sse41Bool64ix2 for bool64ix2 {}

// 32 bit integers

pub trait Sse41U32x4 {
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
}
impl Sse41U32x4 for u32x4 {
    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_epu32(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_epu32(self, other) }
    }
}
pub trait Sse41I32x4 {
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
    fn low_mul(self, other: Self) -> i64x2;
    fn packus(self, other: Self) -> u16x8;
}
impl Sse41I32x4 for i32x4 {
    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_epi32(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_epi32(self, other) }
    }

    #[inline]
    fn low_mul(self, other: Self) -> i64x2 {
        unsafe { x86_mm_mul_epi32(self, other) }
    }
    #[inline]
    fn packus(self, other: Self) -> u16x8 {
        unsafe { x86_mm_packus_epi32(self, other) }
    }
}

pub trait Sse41Bool32ix4 {}
impl Sse41Bool32ix4 for bool32ix4 {}

// 16 bit integers

pub trait Sse41U16x8 {
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
    fn minpos(self) -> Self;
}
impl Sse41U16x8 for u16x8 {
    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_epu16(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_epu16(self, other) }
    }

    #[inline]
    fn minpos(self) -> Self {
        unsafe { x86_mm_minpos_epu16(self) }
    }
}
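// Illustrative sketch (exposition only): `minpos` is the one horizontal
// reduction SSE4.1 offers directly; lane 0 of the result holds the minimum
// value and lane 1 the index of its first occurrence, with the rest zeroed:
//
//     let v = u16x8::new(9, 4, 7, 4, 8, 9, 9, 9);
//     let m = v.minpos();   // lane 0 == 4 (minimum), lane 1 == 1 (index)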
pub trait Sse41I16x8 {}
impl Sse41I16x8 for i16x8 {}

pub trait Sse41Bool16ix8 {}
impl Sse41Bool16ix8 for bool16ix8 {}

// 8 bit integers

pub trait Sse41U8x16 {}
impl Sse41U8x16 for u8x16 {}
pub trait Sse41I8x16 {
    fn max(self, other: Self) -> Self;
    fn min(self, other: Self) -> Self;
}
impl Sse41I8x16 for i8x16 {
    #[inline]
    fn max(self, other: Self) -> Self {
        unsafe { x86_mm_max_epi8(self, other) }
    }
    #[inline]
    fn min(self, other: Self) -> Self {
        unsafe { x86_mm_min_epi8(self, other) }
    }
}

pub trait Sse41Bool8ix16 {}
impl Sse41Bool8ix16 for bool8ix16 {}
@@ -0,0 +1,19 @@
use i8x16;

#[allow(dead_code)]
extern "platform-intrinsic" {
    fn x86_mm_cmpestra(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
    fn x86_mm_cmpestrc(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
    fn x86_mm_cmpestri(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
    fn x86_mm_cmpestrm(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i8x16;
    fn x86_mm_cmpestro(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
    fn x86_mm_cmpestrs(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
    fn x86_mm_cmpestrz(x: i8x16, y: i32, z: i8x16, w: i32, a: i32) -> i32;
    fn x86_mm_cmpistra(x: i8x16, y: i8x16, z: i32) -> i32;
    fn x86_mm_cmpistrc(x: i8x16, y: i8x16, z: i32) -> i32;
    fn x86_mm_cmpistri(x: i8x16, y: i8x16, z: i32) -> i32;
    fn x86_mm_cmpistrm(x: i8x16, y: i8x16, z: i32) -> i8x16;
    fn x86_mm_cmpistro(x: i8x16, y: i8x16, z: i32) -> i32;
    fn x86_mm_cmpistrs(x: i8x16, y: i8x16, z: i32) -> i32;
    fn x86_mm_cmpistrz(x: i8x16, y: i8x16, z: i32) -> i32;
}
@@ -0,0 +1,172 @@
use super::super::*;
use bitcast;

macro_rules! bitcast {
    ($func: ident($a: ident, $b: ident)) => {
        bitcast($func(bitcast($a), bitcast($b)))
    }
}

extern "platform-intrinsic" {
    fn x86_mm_abs_epi8(x: i8x16) -> i8x16;
    fn x86_mm_abs_epi16(x: i16x8) -> i16x8;
    fn x86_mm_abs_epi32(x: i32x4) -> i32x4;
    fn x86_mm_hadd_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_hadd_epi32(x: i32x4, y: i32x4) -> i32x4;
    fn x86_mm_hadds_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_hsub_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_hsub_epi32(x: i32x4, y: i32x4) -> i32x4;
    fn x86_mm_hsubs_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_maddubs_epi16(x: u8x16, y: i8x16) -> i16x8;
    fn x86_mm_mulhrs_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_shuffle_epi8(x: i8x16, y: i8x16) -> i8x16;
    fn x86_mm_sign_epi8(x: i8x16, y: i8x16) -> i8x16;
    fn x86_mm_sign_epi16(x: i16x8, y: i16x8) -> i16x8;
    fn x86_mm_sign_epi32(x: i32x4, y: i32x4) -> i32x4;
}

// 32 bit integers

pub trait Ssse3I32x4 {
    fn abs(self) -> Self;
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
    fn sign(self, other: Self) -> Self;
}
impl Ssse3I32x4 for i32x4 {
    #[inline]
    fn abs(self) -> Self {
        unsafe { x86_mm_abs_epi32(self) }
    }

    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { x86_mm_hadd_epi32(self, other) }
    }
    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { x86_mm_hsub_epi32(self, other) }
    }

    #[inline]
    fn sign(self, other: Self) -> Self {
        unsafe { x86_mm_sign_epi32(self, other) }
    }
}

pub trait Ssse3U32x4 {
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
}
impl Ssse3U32x4 for u32x4 {
    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { bitcast!(x86_mm_hadd_epi32(self, other)) }
    }
    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { bitcast!(x86_mm_hsub_epi32(self, other)) }
    }
}

// 16 bit integers

pub trait Ssse3I16x8 {
    fn abs(self) -> Self;
    fn hadd(self, other: Self) -> Self;
    fn hadds(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
    fn hsubs(self, other: Self) -> Self;
    fn sign(self, other: Self) -> Self;
    fn mulhrs(self, other: Self) -> Self;
}
impl Ssse3I16x8 for i16x8 {
    #[inline]
    fn abs(self) -> Self {
        unsafe { x86_mm_abs_epi16(self) }
    }

    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { x86_mm_hadd_epi16(self, other) }
    }
    #[inline]
    fn hadds(self, other: Self) -> Self {
        unsafe { x86_mm_hadds_epi16(self, other) }
    }
    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { x86_mm_hsub_epi16(self, other) }
    }
    #[inline]
    fn hsubs(self, other: Self) -> Self {
        unsafe { x86_mm_hsubs_epi16(self, other) }
    }

    #[inline]
    fn sign(self, other: Self) -> Self {
        unsafe { x86_mm_sign_epi16(self, other) }
    }

    #[inline]
    fn mulhrs(self, other: Self) -> Self {
        unsafe { x86_mm_mulhrs_epi16(self, other) }
    }
}

pub trait Ssse3U16x8 {
    fn hadd(self, other: Self) -> Self;
    fn hsub(self, other: Self) -> Self;
}
impl Ssse3U16x8 for u16x8 {
    #[inline]
    fn hadd(self, other: Self) -> Self {
        unsafe { bitcast!(x86_mm_hadd_epi16(self, other)) }
    }
    #[inline]
    fn hsub(self, other: Self) -> Self {
        unsafe { bitcast!(x86_mm_hsub_epi16(self, other)) }
    }
}

// 8 bit integers

pub trait Ssse3U8x16 {
    fn shuffle_bytes(self, indices: Self) -> Self;
    fn maddubs(self, other: i8x16) -> i16x8;
}

impl Ssse3U8x16 for u8x16 {
    #[inline]
    fn shuffle_bytes(self, indices: Self) -> Self {
        unsafe { bitcast!(x86_mm_shuffle_epi8(self, indices)) }
    }

    fn maddubs(self, other: i8x16) -> i16x8 {
        unsafe { x86_mm_maddubs_epi16(self, other) }
    }
}
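// Illustrative sketch (exposition only): `shuffle_bytes` (PSHUFB) treats
// `self` as a 16-entry table indexed by the low nibble of each byte of
// `indices` (a set high bit zeroes that output byte), so it doubles as a
// fast parallel table lookup:
//
//     let table = u8x16::new(0, 10, 20, 30, 40, 50, 60, 70,
//                            80, 90, 100, 110, 120, 130, 140, 150);
//     let idx = u8x16::splat(3);
//     let r = table.shuffle_bytes(idx);   // every lane == 30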
pub trait Ssse3I8x16 {
    fn abs(self) -> Self;
    fn shuffle_bytes(self, indices: Self) -> Self;
    fn sign(self, other: Self) -> Self;
}
impl Ssse3I8x16 for i8x16 {
    #[inline]
    fn abs(self) -> Self {
        unsafe { x86_mm_abs_epi8(self) }
    }
    #[inline]
    fn shuffle_bytes(self, indices: Self) -> Self {
        unsafe {
            x86_mm_shuffle_epi8(self, indices)
        }
    }

    #[inline]
    fn sign(self, other: Self) -> Self {
        unsafe { x86_mm_sign_epi8(self, other) }
    }
}
@@ -301,6 +301,7 @@ version = "0.6.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
 "cfg-if 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
@@ -852,6 +853,11 @@ dependencies = [
 "nodrop 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "simd"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "siphasher"
version = "0.2.1"
@@ -1295,6 +1301,7 @@ dependencies = [
"checksum serde 0.9.9 (registry+https://github.com/rust-lang/crates.io-index)" = "05a67b8a53f885f4b6e3ed183806035819f9862474e747fe4488a6d63bcbfcb7"
"checksum serde_codegen_internals 0.14.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4d52006899f910528a10631e5b727973fe668f3228109d1707ccf5bad5490b6e"
"checksum serde_derive 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)" = "f15ea24bd037b2d64646b4d934fa99c649be66e3f7b29fb595a5543b212b1452"
"checksum simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a94d14a2ae1f1f110937de5fb69e494372560181c7e1739a097fcc2cee37ba0"
"checksum siphasher 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2ffc669b726f2bc9a3bcff66e5e23b56ba6bf70e22a34c3d7b6d0b3450b65b84"
"checksum smallvec 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "4f8266519bc1d17d0b5b16f6c21295625d562841c708f6376f49028a43e9c11e"
"checksum smallvec 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2e40af10aafe98b4d8294ae8388d8a5cd0707c65d364872efe72d063ec44bee0"
@@ -11,7 +11,7 @@ servo = ["gkrust-shared/servo"]
quantum_render = ["gkrust-shared/quantum_render"]
cubeb_pulse_rust = ["gkrust-shared/cubeb_pulse_rust"]
gecko_debug = ["gkrust-shared/gecko_debug"]
# simd-accel = ["gkrust-shared/simd-accel"]
simd-accel = ["gkrust-shared/simd-accel"]
no-static-ideograph-encoder-tables = ["gkrust-shared/no-static-ideograph-encoder-tables"]
# parallel-utf8 = ["gkrust-shared/parallel-utf8"]
@@ -299,6 +299,7 @@ version = "0.6.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
 "cfg-if 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]

@@ -839,6 +840,11 @@ dependencies = [
 "nodrop 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "simd"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "siphasher"
version = "0.2.1"

@@ -1282,6 +1288,7 @@ dependencies = [
"checksum serde 0.9.9 (registry+https://github.com/rust-lang/crates.io-index)" = "05a67b8a53f885f4b6e3ed183806035819f9862474e747fe4488a6d63bcbfcb7"
"checksum serde_codegen_internals 0.14.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4d52006899f910528a10631e5b727973fe668f3228109d1707ccf5bad5490b6e"
"checksum serde_derive 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)" = "f15ea24bd037b2d64646b4d934fa99c649be66e3f7b29fb595a5543b212b1452"
"checksum simd 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a94d14a2ae1f1f110937de5fb69e494372560181c7e1739a097fcc2cee37ba0"
"checksum siphasher 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2ffc669b726f2bc9a3bcff66e5e23b56ba6bf70e22a34c3d7b6d0b3450b65b84"
"checksum smallvec 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "4f8266519bc1d17d0b5b16f6c21295625d562841c708f6376f49028a43e9c11e"
"checksum smallvec 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2e40af10aafe98b4d8294ae8388d8a5cd0707c65d364872efe72d063ec44bee0"
@@ -11,7 +11,7 @@ servo = ["gkrust-shared/servo"]
quantum_render = ["gkrust-shared/quantum_render"]
cubeb_pulse_rust = ["gkrust-shared/cubeb_pulse_rust"]
gecko_debug = ["gkrust-shared/gecko_debug"]
# simd-accel = ["gkrust-shared/simd-accel"]
simd-accel = ["gkrust-shared/simd-accel"]
no-static-ideograph-encoder-tables = ["gkrust-shared/no-static-ideograph-encoder-tables"]
# parallel-utf8 = ["gkrust-shared/parallel-utf8"]
@@ -20,4 +20,7 @@ if CONFIG['MOZ_BUILD_WEBRENDER']:
if CONFIG['MOZ_PULSEAUDIO']:
    gkrust_features += ['cubeb_pulse_rust']

if CONFIG['MOZ_RUST_SIMD']:
    gkrust_features += ['simd-accel']

gkrust_features += ['no-static-ideograph-encoder-tables']
@@ -795,6 +795,19 @@ set_config('MOZ_BUILD_WEBRENDER', webrender.build)
set_define('MOZ_BUILD_WEBRENDER', webrender.build)
set_config('MOZ_ENABLE_WEBRENDER', webrender.enable)

# SIMD acceleration for Rust code (currently just encoding_rs)

option('--enable-rust-simd', env='MOZ_RUST_SIMD',
       help='Enable explicit SIMD in Rust code.')

@depends('--enable-rust-simd')
def rust_simd(value):
    if value:
        return True

set_config('MOZ_RUST_SIMD', rust_simd)
set_define('MOZ_RUST_SIMD', rust_simd)

# Printing
# ==============================================================
@depends(target)
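# Illustrative note (not part of the patch): with the option above defined, a
# developer opts in via the standard mozconfig mechanism, e.g.
#
#   ac_add_options --enable-rust-simd
#
# which sets MOZ_RUST_SIMD and, through the moz.build change above, adds the
# simd-accel feature to gkrust.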