WIP on LLVM targets, got all of them working so far!

This commit is contained in:
Giovanni 2017-05-15 20:04:49 +09:00
Родитель 8f6a31de78
Коммит bdbd09de46
8 изменённых файлов: 516 добавлений и 801 удалений

Просмотреть файл

@ -31,113 +31,325 @@ THE SOFTWARE.
#ifndef nativemath_h
#define nativemath_h
#include "NativePath.h"
#include "ShaderFastMathLib.h"
#ifdef __cplusplus
extern "C" {
#endif
//ShaderFastMathLib
//
// Using 0 Newton Raphson iterations
// Relative error : ~3.4% over full
// Precise format : ~small float
// 2 ALU
//
extern float npFastRcpSqrtNR0(float inX);
//
// Using 1 Newton Raphson iterations
// Relative error : ~0.2% over full
// Precise format : ~half float
// 6 ALU
//
extern float npFastRcpSqrtNR1(float inX);
//
// Using 2 Newton Raphson iterations
// Relative error : ~4.6e-004% over full
// Precise format : ~full float
// 9 ALU
//
extern float npFastRcpSqrtNR2(float inX);
//
// Using 0 Newton Raphson iterations
// Relative error : < 0.7% over full
// Precise format : ~small float
// 1 ALU
//
extern float npFastSqrtNR0(float inX);
//
// Use inverse Rcp Sqrt
// Using 1 Newton Raphson iterations
// Relative error : ~0.2% over full
// Precise format : ~half float
// 6 ALU
//
extern float npFastSqrtNR1(float inX);
//
// Use inverse Rcp Sqrt
// Using 2 Newton Raphson iterations
// Relative error : ~4.6e-004% over full
// Precise format : ~full float
// 9 ALU
//
extern float npFastSqrtNR2(float inX);
//
// Using 0 Newton Raphson iterations
// Relative error : < 0.4% over full
// Precise format : ~small float
// 1 ALU
//
extern float npFastRcpNR0(float inX);
//
// Using 1 Newton Raphson iterations
// Relative error : < 0.02% over full
// Precise format : ~half float
// 3 ALU
//
extern float npFastRcpNR1(float inX);
//
// Using 2 Newton Raphson iterations
// Relative error : < 5.0e-005% over full
// Precise format : ~full float
// 5 ALU
//
extern float npFastRcpNR2(float inX);
// 4th order polynomial approximation
// 4 VGRP, 16 ALU Full Rate
// 7 * 10^-5 radians precision
// Reference : Handbook of Mathematical Functions (chapter : Elementary Transcendental Functions), M. Abramowitz and I.A. Stegun, Ed.
extern float npAcosFast4(float inX);
// 4th order polynomial approximation
// 4 VGRP, 16 ALU Full Rate
// 7 * 10^-5 radians precision
extern float npAsinFast4(float inX);
// 4th order hyperbolical approximation
// 4 VGRP, 12 ALU Full Rate
// 7 * 10^-5 radians precision
// Reference : Efficient approximations for the arctangent function, Rajan, S. Sichun Wang Inkol, R. Joyal, A., May 2006
extern float npAtanFast4(float inX);
//LoL engine fast math
extern double npLolFabs(double x);
extern double npLolSin(double x);
extern double npLolCos(double x);
extern void npLolSincos(double x, double *sinx, double *cosx);
extern void npLolSincosf(float x, float *sinx, float *cosx);
extern double npLolTan(double x);
/* Evaluates its argument and discards the result; the trigonometric
 * routines in this header apply it to intermediate floating-point values
 * during range reduction. */
#define FP_USE(x) (void)(x)
/* Branch-prediction hints built on __builtin_expect. */
#define __likely(x) __builtin_expect(!!(x), 1)
#define __unlikely(x) __builtin_expect(!!(x), 0)
/* pi. The literal deliberately carries no 'f' suffix: a float literal is
 * rounded to single precision (about 3.14159274) before being widened,
 * which would leave this double accurate to only ~7 significant digits. */
static const double D_PI = 3.1415926535897932384626433;
static const double PI_2 = 1.57079632679489661923132; /* pi / 2 */
static const double PI_4 = 0.785398163397448309615661; /* pi / 4 */
static const double INV_PI = 0.318309886183790671537768; /* 1 / pi */
static const double ROOT3 = 1.73205080756887729352745; /* sqrt(3) */
static const double ZERO = 0.0;
static const double ONE = 1.0;
static const double NEG_ONE = -1.0;
static const double HALF = 0.5;
static const double QUARTER = 0.25;
static const double TWO = 2.0;
/* 2^-128: threshold below which a cosine is treated as zero by npLolTan. */
static const double VERY_SMALL_NUMBER = 0x1.0p-128;
/* 2^52 and 2^54: magic constants for the add/subtract rounding tricks. */
static const double TWO_EXP_52 = 4503599627370496.0;
static const double TWO_EXP_54 = 18014398509481984.0;
/**
 * Sine Taylor series coefficients, expressed in the variable t = x / π.
 * SC[k] multiplies t^(2k+2) inside the bracket of
 *   sin(π·t) ≈ π·t · (1 + SC[0]·t^2 + SC[1]·t^4 + ...),
 * so the entries are π^(2k+2) / (2k+3)! with alternating sign.
 */
static const double SC[] =
{
-1.6449340668482264364724e-0, // π^2/3!
+8.1174242528335364363700e-1, // π^4/5!
-1.9075182412208421369647e-1, // π^6/7!
+2.6147847817654800504653e-2, // π^8/9!
-2.3460810354558236375089e-3, // π^10/11!
+1.4842879303107100368487e-4, // π^12/13!
-6.9758736616563804745344e-6, // π^14/15!
+2.5312174041370276513517e-7, // π^16/17!
};
/* Cosine Taylor series coefficients, expressed in the variable t = x / π:
 * CC[k] multiplies t^(2k+2) in
 *   cos(π·t) ≈ 1 + CC[0]·t^2 + CC[1]·t^4 + ...
 * Note: the last value should be -1.3878952462213772114468e-7 (i.e.
 * π^18/18!) but it is tweaked in order to get the better average precision
 * required for tan() computations when close to π/2. */
static const double CC[] =
{
-4.9348022005446793094172e-0, // π^2/2!
+4.0587121264167682181850e-0, // π^4/4!
-1.3352627688545894958753e-0, // π^6/6!
+2.3533063035889320454188e-1, // π^8/8!
-2.5806891390014060012598e-2, // π^10/10!
+1.9295743094039230479033e-3, // π^12/12!
-1.0463810492484570711802e-4, // π^14/14!
+4.3030695870329470072978e-6, // π^16/16!
-1.3777e-7, // tweaked stand-in for π^18/18!
};
/* Tangent Taylor series coefficients, expressed in the variable t = x / π:
 * TC[k] multiplies t^(2k+2) inside the bracket of
 *   tan(π·t) ≈ π·t · (1 + TC[0]·t^2 + TC[1]·t^4 + ...).
 * The rational factors come from the OEIS sequences
 * http://oeis.org/A002430 and http://oeis.org/A036279.
 * Note: the last value should be 2.12485922978838540352881e5 (i.e.
 * 443861162*π^18/1856156927625), but it is tweaked in order to get
 * sub 1e-11 average precision in a larger range. */
static const double TC[] =
{
3.28986813369645287294483e0, // π^2/3
1.29878788045336582981920e1, // 2*π^4/15
5.18844961612069061254404e1, // 17*π^6/315
2.07509320280908496804928e2, // 62*π^8/2835
8.30024701695986756361561e2, // 1382*π^10/155925
3.32009324029001216460018e3, // 21844*π^12/6081075
1.32803704909665483598490e4, // 929569*π^14/638512875
5.31214808666037709352112e4, // 6404582*π^16/10854718875
2.373e5, // tweaked stand-in for 443861162*π^18/1856156927625
};
/*
 * Polynomial approximation of sin(x), with x in radians.
 *
 * Works on t = |x| / π: t is reduced by a whole number of half-periods,
 * the SC[] (sine) or CC[] (cosine) polynomial is evaluated on the
 * remainder, and the sign and the factor π are folded back in at the end.
 *
 * LOL_FEATURE_CHEAP_BRANCHES and LOL_FEATURE_VERY_CHEAP_BRANCHES enable
 * optional branches that use shorter polynomials.
 *
 * x: angle in radians.
 * Returns: the approximated sine of x.
 */
static inline double npLolSin(double x)
{
/* t = |x| / π */
double absx = __builtin_fabs(x * INV_PI);
/* If branches are cheap, skip the cycle count when |x| < π/4,
 * and only do the Taylor series up to the required precision. */
#if LOL_FEATURE_CHEAP_BRANCHES
if (absx < QUARTER)
{
/* Computing x^4 is one multiplication too many we do, but it helps
 * interleave the Taylor series operations a lot better. */
double x2 = absx * absx;
double x4 = x2 * x2;
double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
double taylor = sub2 * x2 + sub1;
/* Multiplying by the signed x restores both the sign and the π factor. */
return x * taylor;
}
#endif
/* Wrap |x| to the range [-1, 1] and keep track of the number of
 * cycles required. If odd, we'll need to change the sign of the
 * result. */
/* Adding then subtracting 2^52 leaves absx rounded to an integer, because
 * doubles of that magnitude have no fractional bits.
 * NOTE(review): this relies on the compiler not simplifying the pair
 * away; confirm FP_USE is an effective barrier on every target. */
double num_cycles = absx + TWO_EXP_52;
FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
/* Same trick at 2^54, where doubles are spaced 4 apart: the odd value
 * 2*num_cycles - 1 is rounded to a multiple of 4, and the difference with
 * the unrounded value is +1 for an even num_cycles and -1 for an odd one
 * (assuming round-to-nearest). */
double is_even = TWO * num_cycles - ONE;
FP_USE(is_even); is_even += TWO_EXP_54;
FP_USE(is_even); is_even -= TWO_EXP_54;
FP_USE(is_even);
is_even -= TWO * num_cycles - ONE;
double sign = is_even;
/* absx now holds the signed remainder after removing whole cycles. */
absx -= num_cycles;
/* If branches are very cheap, we have the option to do the Taylor
 * series at a much lower degree by splitting. */
#if LOL_FEATURE_VERY_CHEAP_BRANCHES
if (__builtin_fabs(absx) > QUARTER)
{
/* Far from zero: evaluate the cosine polynomial on 1/2 - |remainder|
 * instead, using sin(π·t) = cos(π·(1/2 - t)). */
sign = (x * absx >= 0.0) ? sign : -sign;
double x1 = HALF - __builtin_fabs(absx);
double x2 = x1 * x1;
double x4 = x2 * x2;
double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
double taylor = sub2 * x2 + sub1;
return taylor * sign;
}
#endif
/* Fold the factor π and the sign of the original argument into sign. */
sign *= (x >= 0.0) ? D_PI : -D_PI;
/* Compute a Taylor series for sin() and combine sign information. */
double x2 = absx * absx;
double x4 = x2 * x2;
#if LOL_FEATURE_VERY_CHEAP_BRANCHES
double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
#else
double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
#endif
/* sub1 holds the even-indexed terms plus 1, sub2 the odd-indexed ones. */
double taylor = sub2 * x2 + sub1;
return absx * taylor * sign;
}
/*
 * Polynomial approximation of cos(x), with x in radians.
 *
 * Works on t = |x| / π: t is reduced by a whole number of half-periods and
 * the CC[] (cosine) or SC[] (sine) polynomial is evaluated on the
 * remainder; the parity of the removed cycle count supplies the sign.
 *
 * LOL_FEATURE_CHEAP_BRANCHES and LOL_FEATURE_VERY_CHEAP_BRANCHES enable
 * optional branches that use shorter polynomials.
 *
 * x: angle in radians.
 * Returns: the approximated cosine of x.
 */
static inline double npLolCos(double x)
{
/* t = |x| / π */
double absx = __builtin_fabs(x * INV_PI);
#if LOL_FEATURE_CHEAP_BRANCHES
/* Small argument (|x| < π/4): no range reduction, short polynomial. */
if (absx < QUARTER)
{
double x2 = absx * absx;
double x4 = x2 * x2;
double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
double taylor = (sub1 * x2 + sub2) * x2 + ONE;
return taylor;
}
#endif
/* Adding then subtracting 2^52 leaves absx rounded to an integer.
 * NOTE(review): this relies on the compiler not simplifying the pair
 * away; confirm FP_USE is an effective barrier on every target. */
double num_cycles = absx + TWO_EXP_52;
FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
/* Same trick at 2^54 (spacing 4): yields +1 for an even num_cycles and
 * -1 for an odd one, assuming round-to-nearest. */
double is_even = TWO * num_cycles - ONE;
FP_USE(is_even); is_even += TWO_EXP_54;
FP_USE(is_even); is_even -= TWO_EXP_54;
FP_USE(is_even);
is_even -= TWO * num_cycles - ONE;
double sign = is_even;
/* absx now holds the signed remainder after removing whole cycles. */
absx -= num_cycles;
#if LOL_FEATURE_VERY_CHEAP_BRANCHES
if (__builtin_fabs(absx) > QUARTER)
{
/* Far from zero: evaluate the sine polynomial on 1/2 - |remainder|
 * instead, using cos(π·t) = sin(π·(1/2 - t)). */
double x1 = HALF - __builtin_fabs(absx);
double x2 = x1 * x1;
double x4 = x2 * x2;
double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
double taylor = sub2 * x2 + sub1;
return x1 * taylor * sign * D_PI;
}
#endif
double x2 = absx * absx;
double x4 = x2 * x2;
#if LOL_FEATURE_VERY_CHEAP_BRANCHES
double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
#else
double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
#endif
/* sub1 holds the even-indexed terms plus 1, sub2 the odd-indexed ones. */
double taylor = sub2 * x2 + sub1;
return taylor * sign;
}
/*
 * Computes polynomial approximations of sin(x) and cos(x) in one pass,
 * sharing the range reduction between the two.
 *
 * x:    angle in radians.
 * sinx: output; receives the approximated sine. Written on every path, so
 *       it must point to a writable double.
 * cosx: output; receives the approximated cosine. Same requirement.
 *
 * LOL_FEATURE_CHEAP_BRANCHES and LOL_FEATURE_VERY_CHEAP_BRANCHES enable
 * optional branches that use shorter polynomials.
 */
static inline void npLolSincos(double x, double *sinx, double *cosx)
{
/* t = |x| / π */
double absx = __builtin_fabs(x * INV_PI);
#if LOL_FEATURE_CHEAP_BRANCHES
/* Small argument (|x| < π/4): no range reduction needed. */
if (absx < QUARTER)
{
double x2 = absx * absx;
double x4 = x2 * x2;
/* Computing the Taylor series to the 11th order is enough to get
 * x * 1e-11 precision, but we push it to the 13th order so that
 * tan() has a better precision. */
double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
double taylors = subs2 * x2 + subs1;
*sinx = x * taylors;
double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
*cosx = taylorc;
return;
}
#endif
/* Adding then subtracting 2^52 leaves absx rounded to an integer.
 * NOTE(review): this relies on the compiler not simplifying the pair
 * away; confirm FP_USE is an effective barrier on every target. */
double num_cycles = absx + TWO_EXP_52;
FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
/* Same trick at 2^54 (spacing 4): yields +1 for an even num_cycles and
 * -1 for an odd one, assuming round-to-nearest. */
double is_even = TWO * num_cycles - ONE;
FP_USE(is_even); is_even += TWO_EXP_54;
FP_USE(is_even); is_even -= TWO_EXP_54;
FP_USE(is_even);
is_even -= TWO * num_cycles - ONE;
double sin_sign = is_even;
double cos_sign = is_even;
/* absx now holds the signed remainder after removing whole cycles. */
absx -= num_cycles;
#if LOL_FEATURE_VERY_CHEAP_BRANCHES
if (__builtin_fabs(absx) > QUARTER)
{
/* Far from zero: swap the roles of the two polynomials and evaluate
 * them on 1/2 - |remainder|. */
cos_sign = sin_sign;
sin_sign = (x * absx >= 0.0) ? sin_sign : -sin_sign;
double x1 = HALF - __builtin_fabs(absx);
double x2 = x1 * x1;
double x4 = x2 * x2;
double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
double taylors = subs2 * x2 + subs1;
*sinx = taylors * sin_sign;
double subc1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
double taylorc = subc2 * x2 + subc1;
*cosx = x1 * taylorc * cos_sign * D_PI;
return;
}
#endif
/* Fold the factor π and the sign of the original argument into the
 * sine's sign; the cosine only needs the cycle parity. */
sin_sign *= (x >= 0.0) ? D_PI : -D_PI;
double x2 = absx * absx;
double x4 = x2 * x2;
#if LOL_FEATURE_VERY_CHEAP_BRANCHES
double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
#else
double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
/* Push Taylor series to the 19th order to enhance tan() accuracy. */
double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
#endif
double taylors = subs2 * x2 + subs1;
*sinx = absx * taylors * sin_sign;
double taylorc = subc2 * x2 + subc1;
*cosx = taylorc * cos_sign;
}
static inline double npLolTan(double x)
{
#if LOL_FEATURE_CHEAP_BRANCHES
double absx = __builtin_fabs(x * INV_PI);
/* This value was determined empirically to ensure an error of no
* more than x * 1e-11 in this range. */
if (absx < 0.163)
{
double x2 = absx * absx;
double x4 = x2 * x2;
double sub1 = (((TC[7] * x4 + TC[5]) * x4
+ TC[3]) * x4 + TC[1]) * x4 + ONE;
double sub2 = (((TC[8] * x4 + TC[6]) * x4
+ TC[4]) * x4 + TC[2]) * x4 + TC[0];
double taylor = sub2 * x2 + sub1;
return x * taylor;
}
#endif
double sinx, cosx;
npLolSincos(x, &sinx, &cosx);
/* Ensure cosx isn't zero. FIXME: we lose the cosx sign here. */
double absc = __builtin_fabs(cosx);
if (__unlikely(absc < VERY_SMALL_NUMBER))
cosx = VERY_SMALL_NUMBER;
return sinx / cosx;
}
//Utility OpenCL vector goodies
@ -151,7 +363,7 @@ static inline float4 npTransformNormalF4(float4 normal, float4 matrix[4])
return normal.xxxx * matrix[0].xyzw + normal.yyyy * matrix[1].xyzw + normal.zzzz * matrix[2].xyzw + normal.wwww * matrix[3].xyzw;
}
static void npMatrixIdentityF4(float4* outMatrix)
static inline void npMatrixIdentityF4(float4* outMatrix)
{
outMatrix[0].yzw = 0.0f;
outMatrix[1].xzw = 0.0f;
@ -163,14 +375,9 @@ static void npMatrixIdentityF4(float4* outMatrix)
outMatrix[3].w = 1.0f;
}
static inline float sqrtf(float x)
static inline float npLengthF4(float4 vec)
{
return sqrt(x);
}
static float npLengthF4(float4 vec)
{
return sqrtf(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z + vec.w * vec.w);
return __builtin_sqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z + vec.w * vec.w);
}
#ifdef __cplusplus

Просмотреть файл

@ -1591,16 +1591,13 @@ typedef uint32_t uint4 __attribute__((ext_vector_type(4)));
#define sqrt __builtin_sqrt
#endif
/* #if !__has_builtin(__builtin_sqrtf)
#if !__has_builtin(__builtin_sqrtf)
#error \"sqrtf clang built-in not available\"
// ff
extern float sqrtf(...);
#else
#define sqrtf __builtin_sqrtf
#endif */
#undef sqrtf
#define sqrtf sqrt
#endif
#if !__has_builtin(__builtin_sqrtl)
#error \"sqrtl clang built-in not available\"

Просмотреть файл

@ -56,13 +56,17 @@
#ifndef SHADER_FAST_MATH_INC_FX
#define SHADER_FAST_MATH_INC_FX
#ifdef __cplusplus
/* Opens the C-linkage region that the `}` guarded by __cplusplus at the end
 * of this header closes. Without the brace the linkage specification would
 * cover only the next declaration and the closing `}` would be unmatched. */
extern "C" {
#endif
/* Overlays an int and a float so the bit pattern of a float can be read or
 * adjusted with integer arithmetic (used by the IEEE integer approximations
 * in this header). */
union _float_int
{
    int i;
    float f;
};
#include <math.h>
#include <NativePath.h>
// Derived from batch testing
// TODO : Should be improved
@ -100,7 +104,7 @@ union _float_int
//
// Approximate guess using integer float arithmetics based on IEEE floating point standard
float rcpSqrtIEEEIntApproximation(float inX, const int inRcpSqrtConst)
static inline float rcpSqrtIEEEIntApproximation(float inX, const int inRcpSqrtConst)
{
union _float_int x;
x.f = inX;
@ -108,7 +112,7 @@ float rcpSqrtIEEEIntApproximation(float inX, const int inRcpSqrtConst)
return x.f;
}
float rcpSqrtNewtonRaphson(float inXHalf, float inRcpX)
static inline float rcpSqrtNewtonRaphson(float inXHalf, float inRcpX)
{
return inRcpX * (-inXHalf * (inRcpX * inRcpX) + 1.5f);
}
@ -119,7 +123,7 @@ float rcpSqrtNewtonRaphson(float inXHalf, float inRcpX)
// Precise format : ~small float
// 2 ALU
//
float fastRcpSqrtNR0(float inX)
static inline float fastRcpSqrtNR0(float inX)
{
float xRcpSqrt = rcpSqrtIEEEIntApproximation(inX, IEEE_INT_RCP_SQRT_CONST_NR0);
return xRcpSqrt;
@ -131,7 +135,7 @@ float fastRcpSqrtNR0(float inX)
// Precise format : ~half float
// 6 ALU
//
float fastRcpSqrtNR1(float inX)
static inline float fastRcpSqrtNR1(float inX)
{
float xhalf = 0.5f * inX;
float xRcpSqrt = rcpSqrtIEEEIntApproximation(inX, IEEE_INT_RCP_SQRT_CONST_NR1);
@ -145,7 +149,7 @@ float fastRcpSqrtNR1(float inX)
// Precise format : ~full float
// 9 ALU
//
float fastRcpSqrtNR2(float inX)
static inline float fastRcpSqrtNR2(float inX)
{
float xhalf = 0.5f * inX;
float xRcpSqrt = rcpSqrtIEEEIntApproximation(inX, IEEE_INT_RCP_SQRT_CONST_NR2);
@ -158,7 +162,7 @@ float fastRcpSqrtNR2(float inX)
//
// SQRT
//
float sqrtIEEEIntApproximation(float inX, const int inSqrtConst)
static inline float sqrtIEEEIntApproximation(float inX, const int inSqrtConst)
{
union _float_int x;
x.f = inX;
@ -172,7 +176,7 @@ float sqrtIEEEIntApproximation(float inX, const int inSqrtConst)
// Precise format : ~small float
// 1 ALU
//
float fastSqrtNR0(float inX)
static inline float fastSqrtNR0(float inX)
{
float xRcp = sqrtIEEEIntApproximation(inX, IEEE_INT_SQRT_CONST_NR0);
return xRcp;
@ -185,7 +189,7 @@ float fastSqrtNR0(float inX)
// Precise format : ~half float
// 6 ALU
//
float fastSqrtNR1(float inX)
static inline float fastSqrtNR1(float inX)
{
// Inverse Rcp Sqrt
return inX * fastRcpSqrtNR1(inX);
@ -198,7 +202,7 @@ float fastSqrtNR1(float inX)
// Precise format : ~full float
// 9 ALU
//
float fastSqrtNR2(float inX)
static inline float fastSqrtNR2(float inX)
{
// Inverse Rcp Sqrt
return inX * fastRcpSqrtNR2(inX);
@ -208,7 +212,7 @@ float fastSqrtNR2(float inX)
// RCP
//
float rcpIEEEIntApproximation(float inX, const int inRcpConst)
static inline float rcpIEEEIntApproximation(float inX, const int inRcpConst)
{
union _float_int x;
x.f = inX;
@ -216,7 +220,7 @@ float rcpIEEEIntApproximation(float inX, const int inRcpConst)
return x.f;
}
float rcpNewtonRaphson(float inX, float inRcpX)
static inline float rcpNewtonRaphson(float inX, float inRcpX)
{
return inRcpX * (-inRcpX * inX + 2.0f);
}
@ -227,7 +231,7 @@ float rcpNewtonRaphson(float inX, float inRcpX)
// Precise format : ~small float
// 1 ALU
//
float fastRcpNR0(float inX)
static inline float fastRcpNR0(float inX)
{
float xRcp = rcpIEEEIntApproximation(inX, IEEE_INT_RCP_CONST_NR0);
return xRcp;
@ -239,7 +243,7 @@ float fastRcpNR0(float inX)
// Precise format : ~half float
// 3 ALU
//
float fastRcpNR1(float inX)
static inline float fastRcpNR1(float inX)
{
float xRcp = rcpIEEEIntApproximation(inX, IEEE_INT_RCP_CONST_NR1);
xRcp = rcpNewtonRaphson(inX, xRcp);
@ -252,7 +256,7 @@ float fastRcpNR1(float inX)
// Precise format : ~full float
// 5 ALU
//
float fastRcpNR2(float inX)
static inline float fastRcpNR2(float inX)
{
float xRcp = rcpIEEEIntApproximation(inX, IEEE_INT_RCP_CONST_NR2);
xRcp = rcpNewtonRaphson(inX, xRcp);
@ -271,7 +275,7 @@ static const float fsl_HALF_PI = 0.5f * 3.1415926535897932384626433f;
// 4 VGRP, 16 ALU Full Rate
// 7 * 10^-5 radians precision
// Reference : Handbook of Mathematical Functions (chapter : Elementary Transcendental Functions), M. Abramowitz and I.A. Stegun, Ed.
float acosFast4(float inX)
static inline float acosFast4(float inX)
{
float x1 = fabsf(inX);
float x2 = x1 * x1;
@ -291,7 +295,7 @@ float acosFast4(float inX)
// 4th order polynomial approximation
// 4 VGRP, 16 ALU Full Rate
// 7 * 10^-5 radians precision
float asinFast4(float inX)
static inline float asinFast4(float inX)
{
float x = inX;
@ -303,9 +307,13 @@ float asinFast4(float inX)
// 4 VGRP, 12 ALU Full Rate
// 7 * 10^-5 radians precision
// Reference : Efficient approximations for the arctangent function, Rajan, S. Sichun Wang Inkol, R. Joyal, A., May 2006
float atanFast4(float inX)
static inline float atanFast4(float inX)
{
float x = inX;
return x*(-0.1784f * fabsf(x) - 0.0663f * x * x + 1.0301f);
}
#ifdef __cplusplus
}
#endif //cplusplus
#endif //SHADER_FAST_MATH_INC_FX

Просмотреть файл

@ -3,6 +3,7 @@
#include "../NativePath.h"
#include "../NativeMemory.h"
#include "stddef.h"
#ifdef __cplusplus
extern "C" {

Просмотреть файл

@ -1,193 +0,0 @@
/*
Copyright (c) 2015 Giovanni Petrantoni
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
//
// NativeMath.c
// NativePath
//
// Created by Giovanni Petrantoni on 11/16/15.
// Copyright © 2015 Giovanni Petrantoni. All rights reserved.
//
#include <math.h>
#include "ShaderFastMathLib.h"
#include "lol_trig.h"
//ShaderFastMathLib
//
// Using 0 Newton Raphson iterations
// Relative error : ~3.4% over full
// Precise format : ~small float
// 2 ALU
//
/* Exported entry point: forwards inX to fastRcpSqrtNR0 and returns its result. */
float npFastRcpSqrtNR0(float inX)
{
    float result = fastRcpSqrtNR0(inX);
    return result;
}
//
// Using 1 Newton Raphson iterations
// Relative error : ~0.2% over full
// Precise format : ~half float
// 6 ALU
//
/* Exported entry point: forwards inX to fastRcpSqrtNR1 and returns its result. */
float npFastRcpSqrtNR1(float inX)
{
    float result = fastRcpSqrtNR1(inX);
    return result;
}
//
// Using 2 Newton Raphson iterations
// Relative error : ~4.6e-004% over full
// Precise format : ~full float
// 9 ALU
//
/* Exported entry point: forwards inX to fastRcpSqrtNR2 and returns its result. */
float npFastRcpSqrtNR2(float inX)
{
    float result = fastRcpSqrtNR2(inX);
    return result;
}
//
// Using 0 Newton Raphson iterations
// Relative error : < 0.7% over full
// Precise format : ~small float
// 1 ALU
//
/* Exported entry point: forwards inX to fastSqrtNR0 and returns its result. */
float npFastSqrtNR0(float inX)
{
    float result = fastSqrtNR0(inX);
    return result;
}
//
// Use inverse Rcp Sqrt
// Using 1 Newton Raphson iterations
// Relative error : ~0.2% over full
// Precise format : ~half float
// 6 ALU
//
/* Exported entry point: forwards inX to fastSqrtNR1 and returns its result. */
float npFastSqrtNR1(float inX)
{
    float result = fastSqrtNR1(inX);
    return result;
}
//
// Use inverse Rcp Sqrt
// Using 2 Newton Raphson iterations
// Relative error : ~4.6e-004% over full
// Precise format : ~full float
// 9 ALU
//
/* Exported entry point: forwards inX to fastSqrtNR2 and returns its result. */
float npFastSqrtNR2(float inX)
{
    float result = fastSqrtNR2(inX);
    return result;
}
//
// Using 0 Newton Raphson iterations
// Relative error : < 0.4% over full
// Precise format : ~small float
// 1 ALU
//
/* Exported entry point: forwards inX to fastRcpNR0 and returns its result. */
float npFastRcpNR0(float inX)
{
    float result = fastRcpNR0(inX);
    return result;
}
//
// Using 1 Newton Raphson iterations
// Relative error : < 0.02% over full
// Precise format : ~half float
// 3 ALU
//
/* Exported entry point: forwards inX to fastRcpNR1 and returns its result. */
float npFastRcpNR1(float inX)
{
    float result = fastRcpNR1(inX);
    return result;
}
//
// Using 2 Newton Raphson iterations
// Relative error : < 5.0e-005% over full
// Precise format : ~full float
// 5 ALU
//
/* Exported entry point: forwards inX to fastRcpNR2 and returns its result. */
float npFastRcpNR2(float inX)
{
    float result = fastRcpNR2(inX);
    return result;
}
// 4th order polynomial approximation
// 4 VGRP, 16 ALU Full Rate
// 7 * 10^-5 radians precision
// Reference : Handbook of Mathematical Functions (chapter : Elementary Transcendental Functions), M. Abramowitz and I.A. Stegun, Ed.
/* Exported entry point: forwards inX to acosFast4 and returns its result. */
float npAcosFast4(float inX)
{
    float result = acosFast4(inX);
    return result;
}
// 4th order polynomial approximation
// 4 VGRP, 16 ALU Full Rate
// 7 * 10^-5 radians precision
/* Exported entry point: forwards inX to asinFast4 and returns its result. */
float npAsinFast4(float inX)
{
    float result = asinFast4(inX);
    return result;
}
// 4th order hyperbolical approximation
// 4 VGRP, 12 ALU Full Rate
// 7 * 10^-5 radians precision
// Reference : Efficient approximations for the arctangent function, Rajan, S. Sichun Wang Inkol, R. Joyal, A., May 2006
/* Exported entry point: forwards inX to atanFast4 and returns its result. */
float npAtanFast4(float inX)
{
    float result = atanFast4(inX);
    return result;
}
/* Exported entry point: forwards x to _lol_fabs and returns its result. */
double npLolFabs(double x)
{
    double result = _lol_fabs(x);
    return result;
}
/* Exported entry point: forwards x to _lol_sin and returns its result. */
double npLolSin(double x)
{
    double result = _lol_sin(x);
    return result;
}
/* Exported entry point: forwards x to _lol_cos and returns its result. */
double npLolCos(double x)
{
    double result = _lol_cos(x);
    return result;
}
/* Exported entry point: forwards x and both output pointers to _lol_sincos,
 * which writes the sine through sinx and the cosine through cosx. */
void npLolSincos(double x, double *sinx, double *cosx)
{
    _lol_sincos(x, sinx, cosx);
    return;
}
/* Exported entry point: forwards x and both output pointers to
 * _lol_sincosf, the single-precision variant. */
void npLolSincosf(float x, float *sinx, float *cosx)
{
    _lol_sincosf(x, sinx, cosx);
    return;
}
/* Exported entry point: forwards x to _lol_tan and returns its result. */
double npLolTan(double x)
{
    double result = _lol_tan(x);
    return result;
}

Просмотреть файл

@ -1,426 +0,0 @@
//
// Lol Engine
//
// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
// This program is free software; you can redistribute it and/or
// modify it under the terms of the Do What The Fuck You Want To
// Public License, Version 2, as published by Sam Hocevar. See
// http://www.wtfpl.net/ for more details.
//
//#include <lol/engine-internal.h>
#include <cmath>
// pi. The literal deliberately carries no 'f' suffix: a float literal is
// rounded to single precision (about 3.14159274) before being widened,
// which would leave this double accurate to only ~7 significant digits.
static const double D_PI = 3.1415926535897932384626433;
#if defined HAVE_FASTMATH_H
# include <fastmath.h>
#endif
// Optimisation helpers
#if defined __GNUC__
# define __likely(x) __builtin_expect(!!(x), 1)
# define __unlikely(x) __builtin_expect(!!(x), 0)
# define INLINEATTR __attribute__((always_inline))
# if defined __x86_64__
# define FP_USE(x) __asm__("" : "+x" (x))
# elif defined __i386__ /* FIXME: this isn't good */
# define FP_USE(x) __asm__("" : "+m" (x))
# else
# define FP_USE(x) (void)(x)
# endif
#else
# define __likely(x) x
# define __unlikely(x) x
# define INLINEATTR
# define FP_USE(x) (void)(x)
#endif
namespace lol
{
static const double PI_2 = 1.57079632679489661923132;
static const double PI_4 = 0.785398163397448309615661;
static const double INV_PI = 0.318309886183790671537768;
static const double ROOT3 = 1.73205080756887729352745;
static const double ZERO = 0.0;
static const double ONE = 1.0;
static const double NEG_ONE = -1.0;
static const double HALF = 0.5;
static const double QUARTER = 0.25;
static const double TWO = 2.0;
#if defined __GNUC__
static const double VERY_SMALL_NUMBER = 0x1.0p-128;
#else
static const double VERY_SMALL_NUMBER = 3e-39;
#endif
static const double TWO_EXP_52 = 4503599627370496.0;
static const double TWO_EXP_54 = 18014398509481984.0;
/** sin Taylor series coefficients. */
static const double SC[] =
{
-1.6449340668482264364724e-0, // π^2/3!
+8.1174242528335364363700e-1, // π^4/5!
-1.9075182412208421369647e-1, // π^6/7!
+2.6147847817654800504653e-2, // π^8/9!
-2.3460810354558236375089e-3, // π^10/11!
+1.4842879303107100368487e-4, // π^12/13!
-6.9758736616563804745344e-6, // π^14/15!
+2.5312174041370276513517e-7, // π^16/17!
};
/* Note: the last value should be -1.3878952462213772114468e-7 (ie.
* π^18/18!) but we tweak it in order to get the better average precision
* required for tan() computations when close to π/2+ values. */
static const double CC[] =
{
-4.9348022005446793094172e-0, // π^2/2!
+4.0587121264167682181850e-0, // π^4/4!
-1.3352627688545894958753e-0, // π^6/6!
+2.3533063035889320454188e-1, // π^8/8!
-2.5806891390014060012598e-2, // π^10/10!
+1.9295743094039230479033e-3, // π^12/12!
-1.0463810492484570711802e-4, // π^14/14!
+4.3030695870329470072978e-6, // π^16/16!
-1.3777e-7,
};
/* These coefficients use Sloanes http://oeis.org/A002430 and
* http://oeis.org/A036279 sequences for the Taylor series of tan().
* Note: the last value should be 2.12485922978838540352881e5 (ie.
* 443861162*π^18/1856156927625), but we tweak it in order to get
* sub 1e-11 average precision in a larger range. */
static const double TC[] =
{
3.28986813369645287294483e0, // π^2/3
1.29878788045336582981920e1, // 2*π^4/15
5.18844961612069061254404e1, // 17*π^6/315
2.07509320280908496804928e2, // 62*π^8/2835
8.30024701695986756361561e2, // 1382*π^10/155925
3.32009324029001216460018e3, // 21844*π^12/6081075
1.32803704909665483598490e4, // 929569*π^14/638512875
5.31214808666037709352112e4, // 6404582*π^16/10854718875
2.373e5,
};
static inline double lol_fabs(double x) INLINEATTR;
#if defined __GNUC__
static inline double lol_round(double x) INLINEATTR;
static inline double lol_trunc(double x) INLINEATTR;
#endif
/* Absolute value of a double: the compiler builtin under GCC-compatible
 * compilers, std::fabs (found through a using-declaration) elsewhere. */
static inline double lol_fabs(double x)
{
#if !defined __GNUC__
    using std::fabs;
    return fabs(x);
#else
    return __builtin_fabs(x);
#endif
}
#if defined __GNUC__
/* Rounds x to an integral value via the compiler's __builtin_round. */
static inline double lol_round(double x)
{
    const double nearest = __builtin_round(x);
    return nearest;
}
/* Drops the fractional part of x via the compiler's __builtin_trunc. */
static inline double lol_trunc(double x)
{
    const double whole = __builtin_trunc(x);
    return whole;
}
#endif
/*
 * Polynomial approximation of sin(x), with x in radians.
 *
 * Works on t = |x| / π: t is reduced by a whole number of half-periods,
 * the SC[] (sine) or CC[] (cosine) polynomial is evaluated on the
 * remainder, and the sign and the factor π are folded back in at the end.
 *
 * LOL_FEATURE_CHEAP_BRANCHES and LOL_FEATURE_VERY_CHEAP_BRANCHES enable
 * optional branches that use shorter polynomials.
 *
 * x: angle in radians.
 * Returns: the approximated sine of x.
 */
double lol_sin(double x)
{
/* t = |x| / π */
double absx = lol_fabs(x * INV_PI);
/* If branches are cheap, skip the cycle count when |x| < π/4,
 * and only do the Taylor series up to the required precision. */
#if LOL_FEATURE_CHEAP_BRANCHES
if (absx < QUARTER)
{
/* Computing x^4 is one multiplication too many we do, but it helps
 * interleave the Taylor series operations a lot better. */
double x2 = absx * absx;
double x4 = x2 * x2;
double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
double taylor = sub2 * x2 + sub1;
/* Multiplying by the signed x restores both the sign and the π factor. */
return x * taylor;
}
#endif
/* Wrap |x| to the range [-1, 1] and keep track of the number of
 * cycles required. If odd, we'll need to change the sign of the
 * result. */
/* Adding then subtracting 2^52 leaves absx rounded to an integer, because
 * doubles of that magnitude have no fractional bits. FP_USE sits between
 * the two steps; NOTE(review): confirm it keeps the compiler from
 * simplifying the pair away on every target. */
double num_cycles = absx + TWO_EXP_52;
FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
/* Same trick at 2^54, where doubles are spaced 4 apart: the odd value
 * 2*num_cycles - 1 is rounded to a multiple of 4, and the difference with
 * the unrounded value is +1 for an even num_cycles and -1 for an odd one
 * (assuming round-to-nearest). */
double is_even = TWO * num_cycles - ONE;
FP_USE(is_even); is_even += TWO_EXP_54;
FP_USE(is_even); is_even -= TWO_EXP_54;
FP_USE(is_even);
is_even -= TWO * num_cycles - ONE;
double sign = is_even;
/* absx now holds the signed remainder after removing whole cycles. */
absx -= num_cycles;
/* If branches are very cheap, we have the option to do the Taylor
 * series at a much lower degree by splitting. */
#if LOL_FEATURE_VERY_CHEAP_BRANCHES
if (lol_fabs(absx) > QUARTER)
{
/* Far from zero: evaluate the cosine polynomial on 1/2 - |remainder|
 * instead, using sin(π·t) = cos(π·(1/2 - t)). */
sign = (x * absx >= 0.0) ? sign : -sign;
double x1 = HALF - lol_fabs(absx);
double x2 = x1 * x1;
double x4 = x2 * x2;
double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
double taylor = sub2 * x2 + sub1;
return taylor * sign;
}
#endif
/* Fold the factor π and the sign of the original argument into sign. */
sign *= (x >= 0.0) ? D_PI : -D_PI;
/* Compute a Taylor series for sin() and combine sign information. */
double x2 = absx * absx;
double x4 = x2 * x2;
#if LOL_FEATURE_VERY_CHEAP_BRANCHES
double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
#else
double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
#endif
/* sub1 holds the even-indexed terms plus 1, sub2 the odd-indexed ones. */
double taylor = sub2 * x2 + sub1;
return absx * taylor * sign;
}
/*
 * Polynomial approximation of cos(x), with x in radians.
 *
 * Works on t = |x| / π: t is reduced by a whole number of half-periods and
 * the CC[] (cosine) or SC[] (sine) polynomial is evaluated on the
 * remainder; the parity of the removed cycle count supplies the sign.
 *
 * LOL_FEATURE_CHEAP_BRANCHES and LOL_FEATURE_VERY_CHEAP_BRANCHES enable
 * optional branches that use shorter polynomials.
 *
 * x: angle in radians.
 * Returns: the approximated cosine of x.
 */
double lol_cos(double x)
{
/* t = |x| / π */
double absx = lol_fabs(x * INV_PI);
#if LOL_FEATURE_CHEAP_BRANCHES
/* Small argument (|x| < π/4): no range reduction, short polynomial. */
if (absx < QUARTER)
{
double x2 = absx * absx;
double x4 = x2 * x2;
double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
double taylor = (sub1 * x2 + sub2) * x2 + ONE;
return taylor;
}
#endif
/* Adding then subtracting 2^52 leaves absx rounded to an integer.
 * NOTE(review): confirm FP_USE keeps the compiler from simplifying the
 * pair away on every target. */
double num_cycles = absx + TWO_EXP_52;
FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
/* Same trick at 2^54 (spacing 4): yields +1 for an even num_cycles and
 * -1 for an odd one, assuming round-to-nearest. */
double is_even = TWO * num_cycles - ONE;
FP_USE(is_even); is_even += TWO_EXP_54;
FP_USE(is_even); is_even -= TWO_EXP_54;
FP_USE(is_even);
is_even -= TWO * num_cycles - ONE;
double sign = is_even;
/* absx now holds the signed remainder after removing whole cycles. */
absx -= num_cycles;
#if LOL_FEATURE_VERY_CHEAP_BRANCHES
if (lol_fabs(absx) > QUARTER)
{
/* Far from zero: evaluate the sine polynomial on 1/2 - |remainder|
 * instead, using cos(π·t) = sin(π·(1/2 - t)). */
double x1 = HALF - lol_fabs(absx);
double x2 = x1 * x1;
double x4 = x2 * x2;
double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
double taylor = sub2 * x2 + sub1;
return x1 * taylor * sign * D_PI;
}
#endif
double x2 = absx * absx;
double x4 = x2 * x2;
#if LOL_FEATURE_VERY_CHEAP_BRANCHES
double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
#else
double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
#endif
/* sub1 holds the even-indexed terms plus 1, sub2 the odd-indexed ones. */
double taylor = sub2 * x2 + sub1;
return taylor * sign;
}
/*
 * Computes polynomial approximations of sin(x) and cos(x) in one pass,
 * sharing the range reduction between the two.
 *
 * x:    angle in radians.
 * sinx: output; receives the approximated sine. Written on every path, so
 *       it must point to a writable double.
 * cosx: output; receives the approximated cosine. Same requirement.
 *
 * LOL_FEATURE_CHEAP_BRANCHES and LOL_FEATURE_VERY_CHEAP_BRANCHES enable
 * optional branches that use shorter polynomials.
 */
void lol_sincos(double x, double *sinx, double *cosx)
{
/* t = |x| / π */
double absx = lol_fabs(x * INV_PI);
#if LOL_FEATURE_CHEAP_BRANCHES
/* Small argument (|x| < π/4): no range reduction needed. */
if (absx < QUARTER)
{
double x2 = absx * absx;
double x4 = x2 * x2;
/* Computing the Taylor series to the 11th order is enough to get
 * x * 1e-11 precision, but we push it to the 13th order so that
 * tan() has a better precision. */
double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
double taylors = subs2 * x2 + subs1;
*sinx = x * taylors;
double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
*cosx = taylorc;
return;
}
#endif
/* Adding then subtracting 2^52 leaves absx rounded to an integer.
 * NOTE(review): confirm FP_USE keeps the compiler from simplifying the
 * pair away on every target. */
double num_cycles = absx + TWO_EXP_52;
FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
/* Same trick at 2^54 (spacing 4): yields +1 for an even num_cycles and
 * -1 for an odd one, assuming round-to-nearest. */
double is_even = TWO * num_cycles - ONE;
FP_USE(is_even); is_even += TWO_EXP_54;
FP_USE(is_even); is_even -= TWO_EXP_54;
FP_USE(is_even);
is_even -= TWO * num_cycles - ONE;
double sin_sign = is_even;
double cos_sign = is_even;
/* absx now holds the signed remainder after removing whole cycles. */
absx -= num_cycles;
#if LOL_FEATURE_VERY_CHEAP_BRANCHES
if (lol_fabs(absx) > QUARTER)
{
/* Far from zero: swap the roles of the two polynomials and evaluate
 * them on 1/2 - |remainder|. */
cos_sign = sin_sign;
sin_sign = (x * absx >= 0.0) ? sin_sign : -sin_sign;
double x1 = HALF - lol_fabs(absx);
double x2 = x1 * x1;
double x4 = x2 * x2;
double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
double taylors = subs2 * x2 + subs1;
*sinx = taylors * sin_sign;
double subc1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
double taylorc = subc2 * x2 + subc1;
*cosx = x1 * taylorc * cos_sign * D_PI;
return;
}
#endif
/* Fold the factor π and the sign of the original argument into the
 * sine's sign; the cosine only needs the cycle parity. */
sin_sign *= (x >= 0.0) ? D_PI : -D_PI;
double x2 = absx * absx;
double x4 = x2 * x2;
#if LOL_FEATURE_VERY_CHEAP_BRANCHES
double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
#else
double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
/* Push Taylor series to the 19th order to enhance tan() accuracy. */
double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
#endif
double taylors = subs2 * x2 + subs1;
*sinx = absx * taylors * sin_sign;
double taylorc = subc2 * x2 + subc1;
*cosx = taylorc * cos_sign;
}
/* Single-precision front end for lol_sincos(): the argument is widened to
 * double, the double overload does the work, and both results are
 * narrowed back to float before being stored through sinx / cosx. */
void lol_sincos(float x, float *sinx, float *cosx)
{
    double wide_sin, wide_cos;

    /* The explicit cast selects the (double, double *, double *) overload. */
    lol_sincos(static_cast<double>(x), &wide_sin, &wide_cos);

    *sinx = static_cast<float>(wide_sin);
    *cosx = static_cast<float>(wide_cos);
}
/* Tangent of x.
 *
 * When LOL_FEATURE_CHEAP_BRANCHES is enabled and |x * INV_PI| is below
 * 0.163, the result comes straight from a polynomial in the TC[]
 * coefficients. Otherwise it is computed as sin(x) / cos(x) using
 * lol_sincos(), with the denominator kept at least VERY_SMALL_NUMBER
 * away from zero so the division stays finite.
 *
 * The clamp preserves the sign of the cosine: replacing a tiny negative
 * cosine with a positive constant would flip the sign of the returned
 * tangent. */
double lol_tan(double x)
{
#if LOL_FEATURE_CHEAP_BRANCHES
    double absx = lol_fabs(x * INV_PI);
    /* This value was determined empirically to ensure an error of no
     * more than x * 1e-11 in this range. */
    if (absx < 0.163)
    {
        double x2 = absx * absx;
        double x4 = x2 * x2;
        /* The polynomial is split into two interleaved Horner chains in x4,
         * recombined below with a single multiply by x2. */
        double sub1 = (((TC[7] * x4 + TC[5]) * x4
                           + TC[3]) * x4 + TC[1]) * x4 + ONE;
        double sub2 = (((TC[8] * x4 + TC[6]) * x4
                           + TC[4]) * x4 + TC[2]) * x4 + TC[0];
        double taylor = sub2 * x2 + sub1;
        return x * taylor;
    }
#endif
    double sinx, cosx;
    lol_sincos(x, &sinx, &cosx);
    /* Ensure cosx isn't zero, keeping its sign so that the quotient ends
     * up on the correct side of the asymptote. */
    double absc = lol_fabs(cosx);
    if (__unlikely(absc < VERY_SMALL_NUMBER))
        cosx = (cosx < 0.0) ? -VERY_SMALL_NUMBER : VERY_SMALL_NUMBER;
    return sinx / cosx;
}
} /* namespace lol */
#ifdef __cplusplus
/* Unmangled entry points: each function below forwards to the function of
 * the same name (minus the leading underscore) in namespace lol. */
extern "C"
{
/* Forwards to lol::lol_fabs(). */
double _lol_fabs(double x)
{
return lol::lol_fabs(x);
}
/* Forwards to lol::lol_sin(). */
double _lol_sin(double x)
{
return lol::lol_sin(x);
}
/* Forwards to lol::lol_cos(). */
double _lol_cos(double x)
{
return lol::lol_cos(x);
}
/* Forwards to the double overload of lol::lol_sincos(); the results are
 * stored through sinx and cosx. */
void _lol_sincos(double x, double *sinx, double *cosx)
{
lol::lol_sincos(x, sinx, cosx);
}
/* Forwards to the float overload of lol::lol_sincos(), selected by the
 * float argument types; the results are stored through sinx and cosx. */
void _lol_sincosf(float x, float *sinx, float *cosx)
{
lol::lol_sincos(x, sinx, cosx);
}
/* Forwards to lol::lol_tan(). */
double _lol_tan(double x)
{
return lol::lol_tan(x);
}
}
#endif

Просмотреть файл

@ -1,21 +0,0 @@
//
// lol_trig.h
// NativePath
//
// Created by Void on 11/17/15.
// Copyright © 2015 Voidtarget. All rights reserved.
//
/* Include guard for the lol trig entry-point declarations. */
#ifndef lol_trig_h
#define lol_trig_h
/* NOTE(review): these prototypes carry no extern "C" wrapper of their own;
 * presumably the header is only consumed from C translation units - confirm. */
double _lol_fabs(double x);
/* NOTE(review): no definition of _lol_round / _lol_trunc is visible next to
 * the other entry points - verify they are defined somewhere. */
double _lol_round(double x);
double _lol_trunc(double x);
double _lol_sin(double x);
double _lol_cos(double x);
/* The sincos variants return both results through the sinx / cosx pointers. */
void _lol_sincos(double x, double *sinx, double *cosx);
void _lol_sincosf(float x, float *sinx, float *cosx);
double _lol_tan(double x);
#endif /* lol_trig_h */

Просмотреть файл

@ -17,7 +17,7 @@ function BuildWindows32DLL(cfile, isCpp)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
if isCpp then flags = flags.." -std=c++1z " end
local cmd = "clang -v -m32 -DNP_WIN32 -Wall -gcodeview -fno-ms-extensions -nostdlibinc -nobuiltininc -nostdinc++ -target i686-pc-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -v -m32 -DNP_WIN32 -Wall -gcodeview -fno-ms-extensions -nostdlibinc -nobuiltininc -nostdinc++ -target i686-pc-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -38,14 +38,19 @@ end
--LLVM bytecode
function BuildLLVM32(cfile)
--- Compile one source file for the i386 LLVM target.
-- Three clang invocations are built from the same flag set: bitcode
-- (`cfile..".bc"`), textual IR (`cfile..".ll"`) and preprocessed source
-- (`cfile..".pp"`).
-- @param cfile  path of the source file to compile
-- @param isCpp  when truthy, the C++ options are appended to the flags
-- Reads the globals `debug`, `debug_flags`, `release_flags`, `common_flags`
-- and `is_verbose`; appends the bitcode path to the global `objs` when the
-- bitcode compile returns 0.
function BuildLLVM32(cfile, isCpp)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
-- NOTE(review): this `cmd` is shadowed by the second `local cmd` below and is
-- never executed; it looks like the pre-change line left over from the diff.
local cmd = "clang -m32 -nostdlibinc -nobuiltininc -nostdinc++ -fno-exceptions "..common_flags.." "..flags.." -o "..cfile..".ll ".." -S -c -emit-llvm -target i386-unknown "..cfile;
if isCpp then flags = flags.." -std=c++1z -fno-rtti -fno-exceptions" end
-- Bitcode output: the only result that is later linked.
local cmd = "clang -DNP_LLVM_BC -m32 -nostdlibinc -nobuiltininc -nostdinc++ "..common_flags.." "..flags.." -o "..cfile..".bc ".." -c -emit-llvm -target i386-unknown "..cfile
-- Human-readable IR (-S) next to the bitcode.
local cmdLL = "clang -DNP_LLVM_BC -m32 -nostdlibinc -nobuiltininc -nostdinc++ "..common_flags.." "..flags.." -o "..cfile..".ll ".." -S -c -emit-llvm -target i386-unknown "..cfile
-- Preprocessor output (-E), redirected by the shell into cfile..".pp".
local cmdPP = "clang -DNP_LLVM_BC -m32 -nostdlibinc -nobuiltininc -nostdinc++ "..common_flags.." "..flags.." -E "..cfile.." > "..cfile..".pp"
-- Only the bitcode command is echoed in verbose mode.
if is_verbose == true then
print(cmd)
end
if os.execute(cmd) == 0 then table.insert(objs, cfile..".bc") end
-- The results of the two auxiliary commands are not checked.
os.execute(cmdLL)
os.execute(cmdPP)
end
function LinkLLVM32()
@ -53,17 +58,110 @@ function LinkLLVM32()
for i, o in ipairs(objs) do
objs_str = objs_str..o.." "
end
local cmd = "llvm-link -o LLVM32\\"..outputName..".bc "..objs_str
local cmd = "llvm-link -o LLVM\\"..outputName.."-i386.bc "..objs_str
if is_verbose == true then
print(cmd)
end
os.execute(cmd)
end
function BuildLLVM64(cfile)
--- Compile one source file to LLVM bitcode for the armv7 target.
-- @param cfile  path of the source file; output goes to `cfile..".bc"`
-- @param isCpp  when truthy, the C++ options are appended to the flags
-- Reads the globals `debug`, `debug_flags`, `release_flags`, `common_flags`
-- and `is_verbose`; appends the bitcode path to the global `objs` when the
-- compile command returns 0.
function BuildLLVMarmv7(cfile, isCpp)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
-- NOTE(review): this x86_64 `cmd` is shadowed by the second `local cmd` below
-- and is never executed; it looks like the pre-change line left over from the diff.
local cmd = "clang -m64 -nostdlibinc -nobuiltininc -nostdinc++ -fno-exceptions "..common_flags.." "..flags.." -o "..cfile..".ll ".." -S -c -emit-llvm -target x86_64-unknown "..cfile;
if isCpp then flags = flags.." -std=c++1z -fno-rtti -fno-exceptions" end
-- The command that actually runs: NEON FPU, hard-float ABI, armv7 target.
local cmd = "clang -DNP_LLVM_BC -nostdlibinc -nobuiltininc -nostdinc++ -mfpu=neon -mfloat-abi=hard "..common_flags.." "..flags.." -o "..cfile..".bc ".." -c -emit-llvm -target armv7-unknown "..cfile
if is_verbose == true then
print(cmd)
end
if os.execute(cmd) == 0 then table.insert(objs, cfile..".bc") end
end
--- Link every bitcode file listed in the global `objs` into the armv7 module.
-- The output path is built from the global `outputName`. The command is
-- echoed only when `is_verbose` is exactly `true`, and the result of
-- llvm-link is not inspected.
function LinkLLVMarmv7()
    -- Each input keeps its trailing space, so the final command line is the
    -- same as with piecewise concatenation.
    local inputs = {}
    for _, bitcode in ipairs(objs) do
        inputs[#inputs + 1] = bitcode .. " "
    end

    local link_cmd = "llvm-link -o LLVM\\" .. outputName .. "-armv7.bc " .. table.concat(inputs)
    if is_verbose == true then
        print(link_cmd)
    end
    os.execute(link_cmd)
end
--- Compile one source file to LLVM bitcode for the armv7s target.
-- @param cfile  path of the source file; output goes to `cfile..".bc"`
-- @param isCpp  when truthy, the C++ options are appended to the flags
-- Reads the globals `debug`, `debug_flags`, `release_flags`, `common_flags`
-- and `is_verbose`. When os.execute returns 0 the bitcode path is appended
-- to the global `objs`; otherwise nothing is recorded.
function BuildLLVMarmv7s(cfile, isCpp)
    local flags
    if debug then
        flags = debug_flags
    else
        flags = release_flags
    end
    if isCpp then
        flags = flags .. " -std=c++1z -fno-rtti -fno-exceptions"
    end

    local bitcode = cfile .. ".bc"
    local options = "clang -DNP_LLVM_BC -nostdlibinc -nobuiltininc -nostdinc++ -mfpu=neon -mfloat-abi=hard "
        .. common_flags .. " " .. flags
    -- The two spaces after the output name are part of the original command line.
    local compile_cmd = options .. " -o " .. bitcode .. "  -c -emit-llvm -target armv7s-unknown " .. cfile

    if is_verbose == true then
        print(compile_cmd)
    end

    local status = os.execute(compile_cmd)
    if status ~= 0 then
        return
    end
    table.insert(objs, bitcode)
end
--- Link every bitcode file listed in the global `objs` into the armv7s module.
-- The output path is built from the global `outputName`. The command is
-- echoed only when `is_verbose` is exactly `true`, and the result of
-- llvm-link is not inspected.
function LinkLLVMarmv7s()
    -- Each input keeps its trailing space, so the final command line is the
    -- same as with piecewise concatenation.
    local inputs = {}
    for _, bitcode in ipairs(objs) do
        inputs[#inputs + 1] = bitcode .. " "
    end

    local link_cmd = "llvm-link -o LLVM\\" .. outputName .. "-armv7s.bc " .. table.concat(inputs)
    if is_verbose == true then
        print(link_cmd)
    end
    os.execute(link_cmd)
end
--- Compile one source file to LLVM bitcode for the aarch64 target.
-- @param cfile  path of the source file; output goes to `cfile..".bc"`
-- @param isCpp  when truthy, the C++ options are appended to the flags
-- Reads the globals `debug`, `debug_flags`, `release_flags`, `common_flags`
-- and `is_verbose`. When os.execute returns 0 the bitcode path is appended
-- to the global `objs`; otherwise nothing is recorded.
function BuildLLVMAArch64(cfile, isCpp)
    local flags
    if debug then
        flags = debug_flags
    else
        flags = release_flags
    end
    if isCpp then
        flags = flags .. " -std=c++1z -fno-rtti -fno-exceptions"
    end

    local bitcode = cfile .. ".bc"
    local options = "clang -DNP_LLVM_BC -nostdlibinc -nobuiltininc -nostdinc++ "
        .. common_flags .. " " .. flags
    -- The two spaces after the output name are part of the original command line.
    local compile_cmd = options .. " -o " .. bitcode .. "  -c -emit-llvm -target aarch64-unknown " .. cfile

    if is_verbose == true then
        print(compile_cmd)
    end

    local status = os.execute(compile_cmd)
    if status ~= 0 then
        return
    end
    table.insert(objs, bitcode)
end
--- Link every bitcode file listed in the global `objs` into the aarch64 module.
-- The output path is built from the global `outputName`. The command is
-- echoed only when `is_verbose` is exactly `true`, and the result of
-- llvm-link is not inspected.
function LinkLLVMAArch64()
    -- Each input keeps its trailing space, so the final command line is the
    -- same as with piecewise concatenation.
    local inputs = {}
    for _, bitcode in ipairs(objs) do
        inputs[#inputs + 1] = bitcode .. " "
    end

    local link_cmd = "llvm-link -o LLVM\\" .. outputName .. "-aarch64.bc " .. table.concat(inputs)
    if is_verbose == true then
        print(link_cmd)
    end
    os.execute(link_cmd)
end
--- Compile one source file to LLVM bitcode for the armv6 target.
-- @param cfile  path of the source file; output goes to `cfile..".bc"`
-- @param isCpp  when truthy, the C++ options are appended to the flags
-- Reads the globals `debug`, `debug_flags`, `release_flags`, `common_flags`
-- and `is_verbose`. When os.execute returns 0 the bitcode path is appended
-- to the global `objs`; otherwise nothing is recorded.
function BuildLLVMarmv6(cfile, isCpp)
    local flags
    if debug then
        flags = debug_flags
    else
        flags = release_flags
    end
    if isCpp then
        flags = flags .. " -std=c++1z -fno-rtti -fno-exceptions"
    end

    local bitcode = cfile .. ".bc"
    local options = "clang -DNP_LLVM_BC -nostdlibinc -nobuiltininc -nostdinc++ -mfloat-abi=hard -mfpu=vfp "
        .. common_flags .. " " .. flags
    -- The two spaces after the output name are part of the original command line.
    local compile_cmd = options .. " -o " .. bitcode .. "  -c -emit-llvm -target armv6-unknown " .. cfile

    if is_verbose == true then
        print(compile_cmd)
    end

    local status = os.execute(compile_cmd)
    if status ~= 0 then
        return
    end
    table.insert(objs, bitcode)
end
--- Link every bitcode file listed in the global `objs` into the armv6 module.
-- The output path is built from the global `outputName`. The command is
-- echoed only when `is_verbose` is exactly `true`, and the result of
-- llvm-link is not inspected.
function LinkLLVMarmv6()
    -- Each input keeps its trailing space, so the final command line is the
    -- same as with piecewise concatenation.
    local inputs = {}
    for _, bitcode in ipairs(objs) do
        inputs[#inputs + 1] = bitcode .. " "
    end

    local link_cmd = "llvm-link -o LLVM\\" .. outputName .. "-armv6.bc " .. table.concat(inputs)
    if is_verbose == true then
        print(link_cmd)
    end
    os.execute(link_cmd)
end
function BuildLLVM64(cfile, isCpp)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
if isCpp then flags = flags.." -std=c++1z -fno-rtti -fno-exceptions" end
local cmd = "clang -DNP_LLVM_BC -m64 -nostdlibinc -nobuiltininc -nostdinc++ "..common_flags.." "..flags.." -o "..cfile..".bc ".." -c -emit-llvm -target x86_64-unknown "..cfile
if is_verbose == true then
print(cmd)
end
@ -75,7 +173,7 @@ function LinkLLVM64()
for i, o in ipairs(objs) do
objs_str = objs_str..o.." "
end
local cmd = "llvm-link -o LLVM64\\"..outputName..".bc "..objs_str
local cmd = "llvm-link -o LLVM\\"..outputName.."-x86_64.bc "..objs_str
if is_verbose == true then
print(cmd)
end
@ -87,7 +185,7 @@ end
function BuildWindows32(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -m32 -DNP_WIN32 -gcodeview -fno-ms-extensions -nostdlibinc -nobuiltininc -nostdinc++ -target i686-pc-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -m32 -DNP_WIN32 -gcodeview -fno-ms-extensions -nostdlibinc -nobuiltininc -nostdinc++ -target i686-pc-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -109,7 +207,7 @@ end
function BuildWindows64(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -m64 -DNP_WIN32 -gcodeview -fno-ms-extensions -nostdlibinc -nobuiltininc -nostdinc++ -target i686-pc-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -m64 -DNP_WIN32 -gcodeview -fno-ms-extensions -nostdlibinc -nobuiltininc -nostdinc++ -target i686-pc-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -155,7 +253,7 @@ end
function BuildWindowsUWP64(cfile)
local flags = ""
if debug then flags = debug_ms_flags else flags = release_ms_flags end
local cmd = "clang-cl -DNP_WIN32 -WX -EHsc -GS- -MD -DWIN_EXPORT -m64 "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang-cl -DNP_WIN32 -WX -EHsc -GS- -MD -DWIN_EXPORT -m64 "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -177,7 +275,7 @@ end
function BuildWindowsUWPARM(cfile)
local flags = ""
if debug then flags = debug_ms_flags else flags = release_ms_flags end
local cmd = "clang-cl -DNP_WIN32 -WX -EHsc -GS- -MD -DWIN_EXPORT -m32 --target=thumbv7-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang-cl -DNP_WIN32 -WX -EHsc -GS- -MD -DWIN_EXPORT -m32 --target=thumbv7-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -266,7 +364,7 @@ function BuildIOSArm7(cfile, isCpp)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
if isCpp then flags = flags.." -std=c++1z " end
local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target armv7-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target armv7-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -291,7 +389,7 @@ end
function BuildIOSArm7s(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target armv7s-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target armv7s-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -313,7 +411,7 @@ end
function BuildIOSArm64(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target arm64-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target arm64-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -335,7 +433,7 @@ end
function BuildIOSx86(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target i386-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target i386-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -357,7 +455,7 @@ end
function BuildIOSx64(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target x86_64-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target x86_64-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -381,7 +479,7 @@ end
function BuildMacOSx86(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_MACOS -nostdlibinc -nobuiltininc -nostdinc++ -mmacosx-version-min=10.5 -target i386-apple-macosx "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_MACOS -nostdlibinc -nobuiltininc -nostdinc++ -mmacosx-version-min=10.5 -target i386-apple-macosx "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -403,7 +501,7 @@ end
function BuildMacOSx64(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_MACOS -nostdlibinc -nobuiltininc -nostdinc++ -mmacosx-version-min=10.5 -target x86_64-apple-macosx "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_MACOS -nostdlibinc -nobuiltininc -nostdinc++ -mmacosx-version-min=10.5 -target x86_64-apple-macosx "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -427,7 +525,7 @@ end
function BuildAndroidArm(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target arm-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target arm-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -449,7 +547,7 @@ end
function BuildAndroidArm7(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target armv7-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target armv7-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -471,7 +569,7 @@ end
function BuildAndroidArm64(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target aarch64-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target aarch64-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -493,7 +591,7 @@ end
function BuildAndroidx86(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target i386-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target i386-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -518,7 +616,7 @@ end
function BuildAndroidx64(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target x86_64-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target x86_64-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -542,7 +640,7 @@ end
function BuildLinuxX64(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_LINUX -nostdlibinc -nobuiltininc -nostdinc++ -fPIC -target x86_64-linux-gnu "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_LINUX -nostdlibinc -nobuiltininc -nostdinc++ -fPIC -target x86_64-linux-gnu "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -564,7 +662,7 @@ end
function BuildLinuxX86(cfile)
local flags = ""
if debug then flags = debug_flags else flags = release_flags end
local cmd = "clang -DNP_LINUX -nostdlibinc -nobuiltininc -nostdinc++ -fPIC -target i386-linux-gnu "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
local cmd = "clang -DNP_LINUX -nostdlibinc -nobuiltininc -nostdinc++ -fPIC -target i386-linux-gnu "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
if is_verbose == true then
print(cmd)
end
@ -767,23 +865,67 @@ elseif platform == "macos" then
os.remove("macOS\\"..outputName.."_x86_64.a")
elseif platform == "llvm" then
lfs.mkdir("LLVM32")
lfs.mkdir("LLVM")
objs = {}
print ("Building LLVM x86...")
for i,f in ipairs(cfiles) do
BuildLLVM32(f)
BuildLLVM32(f, false)
end
for i,f in ipairs(cppfiles) do
BuildLLVM32(f, true)
end
LinkLLVM32()
lfs.mkdir("LLVM64")
objs = {}
print ("Building LLVM x64...")
for i,f in ipairs(cfiles) do
BuildLLVM64(f)
BuildLLVM64(f, false)
end
for i,f in ipairs(cppfiles) do
BuildLLVM64(f, true)
end
LinkLLVM64()
objs = {}
print ("Building LLVM armv6...")
for i,f in ipairs(cfiles) do
BuildLLVMarmv6(f, false)
end
for i,f in ipairs(cppfiles) do
BuildLLVMarmv6(f, true)
end
LinkLLVMarmv6()
objs = {}
print ("Building LLVM armv7...")
for i,f in ipairs(cfiles) do
BuildLLVMarmv7(f, false)
end
for i,f in ipairs(cppfiles) do
BuildLLVMarmv7(f, true)
end
LinkLLVMarmv7()
objs = {}
print ("Building LLVM armv7s...")
for i,f in ipairs(cfiles) do
BuildLLVMarmv7s(f, false)
end
for i,f in ipairs(cppfiles) do
BuildLLVMarmv7s(f, true)
end
LinkLLVMarmv7s()
objs = {}
print ("Building LLVM AArch64...")
for i,f in ipairs(cfiles) do
BuildLLVMAArch64(f, false)
end
for i,f in ipairs(cppfiles) do
BuildLLVMAArch64(f, true)
end
LinkLLVMAArch64()
elseif platform == "linux" then
lfs.mkdir("Linux")