WIP on LLVM targets, got all of them working so far!

2017-05-15 20:04:49 +09:00 · 2017-05-15 20:04:49 +09:00 · bdbd09de46
--- a/NativePath/NativeMath.h
+++ b/NativePath/NativeMath.h
@ -31,113 +31,325 @@ THE SOFTWARE.
 #ifndef nativemath_h
 #define nativemath_h

-#include "NativePath.h"
+#include "ShaderFastMathLib.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

-//ShaderFastMathLib
-
-//
-// Using 0 Newton Raphson iterations
-// Relative error : ~3.4% over full
-// Precise format : ~small float
-// 2 ALU
-//
-extern float npFastRcpSqrtNR0(float inX);
-
-//
-// Using 1 Newton Raphson iterations
-// Relative error : ~0.2% over full
-// Precise format : ~half float
-// 6 ALU
-//
-extern float npFastRcpSqrtNR1(float inX);
-
-//
-// Using 2 Newton Raphson iterations
-// Relative error : ~4.6e-004%  over full
-// Precise format : ~full float
-// 9 ALU
-//
-extern float npFastRcpSqrtNR2(float inX);
-
-//
-// Using 0 Newton Raphson iterations
-// Relative error : < 0.7% over full
-// Precise format : ~small float
-// 1 ALU
-//
-extern float npFastSqrtNR0(float inX);
-
-//
-// Use inverse Rcp Sqrt
-// Using 1 Newton Raphson iterations
-// Relative error : ~0.2% over full
-// Precise format : ~half float
-// 6 ALU
-//
-extern float npFastSqrtNR1(float inX);
-
-//
-// Use inverse Rcp Sqrt
-// Using 2 Newton Raphson iterations
-// Relative error : ~4.6e-004%  over full
-// Precise format : ~full float
-// 9 ALU
-//
-extern float npFastSqrtNR2(float inX);
-
-//
-// Using 0 Newton Raphson iterations
-// Relative error : < 0.4% over full
-// Precise format : ~small float
-// 1 ALU
-//
-extern float npFastRcpNR0(float inX);
-
-//
-// Using 1 Newton Raphson iterations
-// Relative error : < 0.02% over full
-// Precise format : ~half float
-// 3 ALU
-//
-extern float npFastRcpNR1(float inX);
-
-//
-// Using 2 Newton Raphson iterations
-// Relative error : < 5.0e-005%  over full
-// Precise format : ~full float
-// 5 ALU
-//
-extern float npFastRcpNR2(float inX);
-
-// 4th order polynomial approximation
-// 4 VGRP, 16 ALU Full Rate
-// 7 * 10^-5 radians precision
-// Reference : Handbook of Mathematical Functions (chapter : Elementary Transcendental Functions), M. Abramowitz and I.A. Stegun, Ed.
-extern float npAcosFast4(float inX);
-
-// 4th order polynomial approximation
-// 4 VGRP, 16 ALU Full Rate
-// 7 * 10^-5 radians precision
-extern float npAsinFast4(float inX);
-
-// 4th order hyperbolical approximation
-// 4 VGRP, 12 ALU Full Rate
-// 7 * 10^-5 radians precision
-// Reference : Efficient approximations for the arctangent function, Rajan, S. Sichun Wang Inkol, R. Joyal, A., May 2006
-extern float npAtanFast4(float inX);
-
 //LoL engine fast math

-extern double npLolFabs(double x);
-extern double npLolSin(double x);
-extern double npLolCos(double x);
-extern void npLolSincos(double x, double *sinx, double *cosx);
-extern void npLolSincosf(float x, float *sinx, float *cosx);
-extern double npLolTan(double x);
+/* Marks x as used. This expands to a plain (void) cast and has no effect
+ * beyond evaluating x.
+ * NOTE(review): as written it is not an optimization barrier, so the
+ * "add then subtract a large power of two" rounding sequences in the trig
+ * routines rely on the compiler preserving IEEE evaluation (e.g. no
+ * -ffast-math) — confirm against the build flags. */
+#define FP_USE(x) (void)(x)
+/* Branch hints: both evaluate to !!(x) and tell the compiler that the
+ * condition is expected to be true (__likely) or false (__unlikely). */
+#define __likely(x) __builtin_expect(!!(x), 1)
+#define __unlikely(x) __builtin_expect(!!(x), 0)
+
+/* π as a genuine double. The literal must not carry an `f` suffix: a float
+ * literal is rounded to single precision before being widened to double,
+ * which would leave D_PI (and every result scaled by it) accurate to only
+ * about 7 significant digits. */
+static const double D_PI = 3.1415926535897932384626433;
+
+/* π/2, π/4, 1/π and √3.
+ * NOTE(review): in the part of this header visible here only INV_PI is
+ * referenced; PI_2, PI_4 and ROOT3 look unused — confirm before removing. */
+static const double PI_2   = 1.57079632679489661923132;
+static const double PI_4   = 0.785398163397448309615661;
+static const double INV_PI = 0.318309886183790671537768;
+static const double ROOT3  = 1.73205080756887729352745;
+
+/* Named scalar constants used by the polynomial evaluation code.
+ * NOTE(review): ZERO and NEG_ONE are not referenced in the visible code. */
+static const double ZERO    = 0.0;
+static const double ONE     = 1.0;
+static const double NEG_ONE = -1.0;
+static const double HALF    = 0.5;
+static const double QUARTER = 0.25;
+static const double TWO     = 2.0;
+/* 2^-128, written as a hexadecimal floating constant; npLolTan() uses it as
+ * the smallest magnitude it accepts for a divisor. */
+static const double VERY_SMALL_NUMBER = 0x1.0p-128;
+/* 2^52 and 2^54: added to and then subtracted from a value to round it. */
+static const double TWO_EXP_52 = 4503599627370496.0;
+static const double TWO_EXP_54 = 18014398509481984.0;
+
+/** sin Taylor series coefficients. */
+static const double SC[] =
+{
+	-1.6449340668482264364724e-0, // π^2/3!
+	+8.1174242528335364363700e-1, // π^4/5!
+	-1.9075182412208421369647e-1, // π^6/7!
+	+2.6147847817654800504653e-2, // π^8/9!
+	-2.3460810354558236375089e-3, // π^10/11!
+	+1.4842879303107100368487e-4, // π^12/13!
+	-6.9758736616563804745344e-6, // π^14/15!
+	+2.5312174041370276513517e-7, // π^16/17!
+};
+
+/* Note: the last value should be -1.3878952462213772114468e-7 (ie.
+ * π^18/18!) but we tweak it in order to get the better average precision
+ * required for tan() computations when close to π/2+kπ values. */
+static const double CC[] =
+{
+	-4.9348022005446793094172e-0, // π^2/2!
+	+4.0587121264167682181850e-0, // π^4/4!
+	-1.3352627688545894958753e-0, // π^6/6!
+	+2.3533063035889320454188e-1, // π^8/8!
+	-2.5806891390014060012598e-2, // π^10/10!
+	+1.9295743094039230479033e-3, // π^12/12!
+	-1.0463810492484570711802e-4, // π^14/14!
+	+4.3030695870329470072978e-6, // π^16/16!
+	-1.3777e-7,
+};
+
+/* These coefficients use Sloane’s http://oeis.org/A002430 and
+ * http://oeis.org/A036279 sequences for the Taylor series of tan().
+ * Note: the last value should be 2.12485922978838540352881e5 (ie.
+ * 443861162*π^18/1856156927625), but we tweak it in order to get
+ * sub 1e-11 average precision in a larger range. */
+static const double TC[] =
+{
+	3.28986813369645287294483e0, // π^2/3
+	1.29878788045336582981920e1, // 2*π^4/15
+	5.18844961612069061254404e1, // 17*π^6/315
+	2.07509320280908496804928e2, // 62*π^8/2835
+	8.30024701695986756361561e2, // 1382*π^10/155925
+	3.32009324029001216460018e3, // 21844*π^12/6081075
+	1.32803704909665483598490e4, // 929569*π^14/638512875
+	5.31214808666037709352112e4, // 6404582*π^16/10854718875
+	2.373e5,
+};
+
+/**
+ * Polynomial approximation of sin(x) for x in radians.
+ *
+ * x is rescaled to units of π, reduced by a whole number of half-turns, and
+ * the remainder t is fed to 1 + SC[0]*t^2 + SC[1]*t^4 + ..., which per the
+ * SC table comments is the series of sin(π*t)/(π*t). The compile-time
+ * switches LOL_FEATURE_CHEAP_BRANCHES and LOL_FEATURE_VERY_CHEAP_BRANCHES
+ * enable extra branches that use shorter polynomials on sub-ranges.
+ *
+ * @param x  angle in radians
+ * @return   approximate sine of x
+ */
+static inline double npLolSin(double x)
+{
+	/* |x| expressed in units of π. */
+	double absx = __builtin_fabs(x * INV_PI);
+	
+	/* If branches are cheap, skip the cycle count when |x| < π/4,
+	 * and only do the Taylor series up to the required precision. */
+#if LOL_FEATURE_CHEAP_BRANCHES
+	if (absx < QUARTER)
+	{
+		/* Computing x^4 is one multiplication too many we do, but it helps
+		 * interleave the Taylor series operations a lot better. */
+		double x2 = absx * absx;
+		double x4 = x2 * x2;
+		double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
+		double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
+		double taylor = sub2 * x2 + sub1;
+		return x * taylor;
+	}
+#endif
+	
+	/* Reduce |x|/π to a remainder around zero and keep track of the number
+	 * of cycles (half-turns) removed. If odd, we'll need to change the sign
+	 * of the result. Adding then subtracting 2^52 drops the fractional bits
+	 * of absx; this assumes the default round-to-nearest mode and that the
+	 * compiler does not simplify the pair away. */
+	double num_cycles = absx + TWO_EXP_52;
+	FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
+	
+	/* Parity of num_cycles as a sign. 2n-1 is odd, so pushing it through
+	 * 2^54 (where doubles are spaced 4 apart) rounds it to a multiple of 4;
+	 * the difference from 2n-1 is expected to be +1 for even n and -1 for
+	 * odd n. */
+	double is_even = TWO * num_cycles - ONE;
+	FP_USE(is_even); is_even += TWO_EXP_54;
+	FP_USE(is_even); is_even -= TWO_EXP_54;
+	FP_USE(is_even);
+	is_even -= TWO * num_cycles - ONE;
+	double sign = is_even;
+	
+	/* From here on absx is the signed remainder, not an absolute value. */
+	absx -= num_cycles;
+	
+	/* If branches are very cheap, we have the option to do the Taylor
+	 * series at a much lower degree by splitting: remainders larger than a
+	 * quarter are evaluated with the cosine coefficients (CC) on the
+	 * complementary value x1 = 1/2 - |absx|. */
+#if LOL_FEATURE_VERY_CHEAP_BRANCHES
+	if (__builtin_fabs(absx) > QUARTER)
+	{
+		sign = (x * absx >= 0.0) ? sign : -sign;
+		
+		double x1 = HALF - __builtin_fabs(absx);
+		double x2 = x1 * x1;
+		double x4 = x2 * x2;
+		double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
+		double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
+		double taylor = sub2 * x2 + sub1;
+		
+		return taylor * sign;
+	}
+#endif
+	
+	/* Fold the π scale factor and the sign of x into the parity sign. */
+	sign *= (x >= 0.0) ? D_PI : -D_PI;
+	
+	/* Compute a Taylor series for sin() and combine sign information.
+	 * Even and odd powers of x2 are accumulated separately (sub1, sub2)
+	 * and merged at the end. */
+	double x2 = absx * absx;
+	double x4 = x2 * x2;
+#if LOL_FEATURE_VERY_CHEAP_BRANCHES
+	double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
+	double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
+#else
+	double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
+	double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
+#endif
+	double taylor = sub2 * x2 + sub1;
+	
+	return absx * taylor * sign;
+}
+
+/**
+ * Polynomial approximation of cos(x) for x in radians.
+ *
+ * x is rescaled to units of π, reduced by a whole number of half-turns, and
+ * the remainder t is fed to 1 + CC[0]*t^2 + CC[1]*t^4 + ..., which per the
+ * CC table comments is the series of cos(π*t). The compile-time switches
+ * LOL_FEATURE_CHEAP_BRANCHES and LOL_FEATURE_VERY_CHEAP_BRANCHES enable
+ * extra branches that use shorter polynomials on sub-ranges.
+ *
+ * @param x  angle in radians
+ * @return   approximate cosine of x
+ */
+static inline double npLolCos(double x)
+{
+	/* |x| expressed in units of π. */
+	double absx = __builtin_fabs(x * INV_PI);
+	
+	/* Optional shortcut for |x| < π/4: no reduction, shorter polynomial. */
+#if LOL_FEATURE_CHEAP_BRANCHES
+	if (absx < QUARTER)
+	{
+		double x2 = absx * absx;
+		double x4 = x2 * x2;
+		double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
+		double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
+		double taylor = (sub1 * x2 + sub2) * x2 + ONE;
+		return taylor;
+	}
+#endif
+	
+	/* Number of whole half-turns in |x|/π. Adding then subtracting 2^52
+	 * drops the fractional bits; this assumes round-to-nearest mode and
+	 * that the compiler does not simplify the pair away. */
+	double num_cycles = absx + TWO_EXP_52;
+	FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
+	
+	/* Parity of num_cycles as a sign: expected to be +1 for an even count
+	 * and -1 for an odd count, obtained by rounding the odd value 2n-1 to
+	 * a multiple of 4 through 2^54. */
+	double is_even = TWO * num_cycles - ONE;
+	FP_USE(is_even); is_even += TWO_EXP_54;
+	FP_USE(is_even); is_even -= TWO_EXP_54;
+	FP_USE(is_even);
+	is_even -= TWO * num_cycles - ONE;
+	double sign = is_even;
+	
+	/* From here on absx is the signed remainder, not an absolute value. */
+	absx -= num_cycles;
+	
+	/* Optional split: remainders larger than a quarter are evaluated with
+	 * the sine coefficients (SC) on the complementary value
+	 * x1 = 1/2 - |absx|, scaled by π. */
+#if LOL_FEATURE_VERY_CHEAP_BRANCHES
+	if (__builtin_fabs(absx) > QUARTER)
+	{
+		double x1 = HALF - __builtin_fabs(absx);
+		double x2 = x1 * x1;
+		double x4 = x2 * x2;
+		double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
+		double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
+		double taylor = sub2 * x2 + sub1;
+		
+		return x1 * taylor * sign * D_PI;
+	}
+#endif
+	
+	/* Cosine polynomial in the remainder; even and odd powers of x2 are
+	 * accumulated separately (sub1, sub2) and merged at the end. */
+	double x2 = absx * absx;
+	double x4 = x2 * x2;
+#if LOL_FEATURE_VERY_CHEAP_BRANCHES
+	double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
+	double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
+#else
+	double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
+	double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
+#endif
+	double taylor = sub2 * x2 + sub1;
+	
+	return taylor * sign;
+}
+
+/**
+ * Computes approximations of sin(x) and cos(x) together, sharing the
+ * argument reduction between the two.
+ *
+ * @param x     angle in radians
+ * @param sinx  output; receives the approximate sine. Dereferenced
+ *              unconditionally, so it must point to valid storage.
+ * @param cosx  output; receives the approximate cosine. Dereferenced
+ *              unconditionally, so it must point to valid storage.
+ */
+static inline void npLolSincos(double x, double *sinx, double *cosx)
+{
+	/* |x| expressed in units of π. */
+	double absx = __builtin_fabs(x * INV_PI);
+	
+	/* Optional shortcut for |x| < π/4: no reduction needed. */
+#if LOL_FEATURE_CHEAP_BRANCHES
+	if (absx < QUARTER)
+	{
+		double x2 = absx * absx;
+		double x4 = x2 * x2;
+		
+		/* Computing the Taylor series to the 11th order is enough to get
+		 * x * 1e-11 precision, but we push it to the 13th order so that
+		 * tan() has a better precision. */
+		double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
+		double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
+		double taylors = subs2 * x2 + subs1;
+		*sinx = x * taylors;
+		
+		double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
+		double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
+		double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
+		*cosx = taylorc;
+		
+		return;
+	}
+#endif
+	
+	/* Number of whole half-turns in |x|/π. Adding then subtracting 2^52
+	 * drops the fractional bits; this assumes round-to-nearest mode and
+	 * that the compiler does not simplify the pair away. */
+	double num_cycles = absx + TWO_EXP_52;
+	FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
+	
+	/* Parity of num_cycles as a sign: expected to be +1 for an even count
+	 * and -1 for an odd count. Both results start from this sign. */
+	double is_even = TWO * num_cycles - ONE;
+	FP_USE(is_even); is_even += TWO_EXP_54;
+	FP_USE(is_even); is_even -= TWO_EXP_54;
+	FP_USE(is_even);
+	is_even -= TWO * num_cycles - ONE;
+	double sin_sign = is_even;
+	double cos_sign = is_even;
+	
+	/* From here on absx is the signed remainder, not an absolute value. */
+	absx -= num_cycles;
+	
+	/* Optional split: for remainders larger than a quarter the roles of
+	 * the two coefficient tables are swapped and the polynomials are
+	 * evaluated on the complementary value x1 = 1/2 - |absx|. */
+#if LOL_FEATURE_VERY_CHEAP_BRANCHES
+	if (__builtin_fabs(absx) > QUARTER)
+	{
+		cos_sign = sin_sign;
+		sin_sign = (x * absx >= 0.0) ? sin_sign : -sin_sign;
+		
+		double x1 = HALF - __builtin_fabs(absx);
+		double x2 = x1 * x1;
+		double x4 = x2 * x2;
+		
+		double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
+		double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
+		double taylors = subs2 * x2 + subs1;
+		*sinx = taylors * sin_sign;
+		
+		double subc1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
+		double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
+		double taylorc = subc2 * x2 + subc1;
+		*cosx = x1 * taylorc * cos_sign * D_PI;
+		
+		return;
+	}
+#endif
+	
+	/* Fold the π scale factor and the sign of x into the sine's sign. */
+	sin_sign *= (x >= 0.0) ? D_PI : -D_PI;
+	
+	double x2 = absx * absx;
+	double x4 = x2 * x2;
+#if LOL_FEATURE_VERY_CHEAP_BRANCHES
+	double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
+	double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
+	double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
+	double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
+#else
+	double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
+	double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
+	/* Push Taylor series to the 19th order to enhance tan() accuracy. */
+	double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
+	double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
+#endif
+	/* Merge the even/odd partial sums and apply scale and sign. */
+	double taylors = subs2 * x2 + subs1;
+	*sinx = absx * taylors * sin_sign;
+	
+	double taylorc = subc2 * x2 + subc1;
+	*cosx = taylorc * cos_sign;
+}
+    
+/**
+ * Approximation of tan(x) for x in radians.
+ *
+ * When LOL_FEATURE_CHEAP_BRANCHES is set, small arguments (|x|/π < 0.163)
+ * are handled by a dedicated polynomial built from the TC table. Everything
+ * else is computed as sin/cos via npLolSincos(), with the divisor kept away
+ * from zero.
+ *
+ * @param x  angle in radians
+ * @return   approximate tangent of x
+ */
+static inline double npLolTan(double x)
+{
+#if LOL_FEATURE_CHEAP_BRANCHES
+	double absx = __builtin_fabs(x * INV_PI);
+	
+	/* This value was determined empirically to ensure an error of no
+	 * more than x * 1e-11 in this range. */
+	if (absx < 0.163)
+	{
+		double x2 = absx * absx;
+		double x4 = x2 * x2;
+		double sub1 = (((TC[7] * x4 + TC[5]) * x4
+						+ TC[3]) * x4 + TC[1]) * x4 + ONE;
+		double sub2 = (((TC[8] * x4 + TC[6]) * x4
+						+ TC[4]) * x4 + TC[2]) * x4 + TC[0];
+		double taylor = sub2 * x2 + sub1;
+		return x * taylor;
+	}
+#endif
+	
+	double sinx, cosx;
+	npLolSincos(x, &sinx, &cosx);
+	
+	/* Ensure the divisor isn't (nearly) zero. The clamp keeps the sign of
+	 * cosx, so the quotient stays on the correct side of the pole instead
+	 * of always coming out with the sign of sinx. */
+	double absc = __builtin_fabs(cosx);
+	
+	if (__unlikely(absc < VERY_SMALL_NUMBER))
+		cosx = __builtin_copysign(VERY_SMALL_NUMBER, cosx);
+	return sinx / cosx;
+}

 //Utility OpenCL vector goodies

@ -151,7 +363,7 @@ static inline float4 npTransformNormalF4(float4 normal, float4 matrix[4])
    return normal.xxxx * matrix[0].xyzw + normal.yyyy * matrix[1].xyzw + normal.zzzz * matrix[2].xyzw + normal.wwww * matrix[3].xyzw;
 }

-static void npMatrixIdentityF4(float4* outMatrix)
+static inline void npMatrixIdentityF4(float4* outMatrix)
 {
    outMatrix[0].yzw = 0.0f;
    outMatrix[1].xzw = 0.0f;
@ -163,14 +375,9 @@ static void npMatrixIdentityF4(float4* outMatrix)
    outMatrix[3].w = 1.0f;
 }

-static inline float sqrtf(float x)
+/* Euclidean length of a 4-component vector: the square root of the sum of
+ * the squared x, y, z and w components. The single-precision builtin is used
+ * so the float sum is not widened to double and narrowed again. */
+static inline float npLengthF4(float4 vec)
 {
-	return sqrt(x);
-}
-
-static float npLengthF4(float4 vec)
-{
-    return sqrtf(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z + vec.w * vec.w);
+    return __builtin_sqrtf(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z + vec.w * vec.w);
 }

 #ifdef __cplusplus
--- a/NativePath/NativePath.h
+++ b/NativePath/NativePath.h
@ -1591,16 +1591,13 @@ typedef uint32_t uint4 __attribute__((ext_vector_type(4)));
 	#define sqrt __builtin_sqrt
 #endif

-/* #if !__has_builtin(__builtin_sqrtf)
+#if !__has_builtin(__builtin_sqrtf)
 	#error \"sqrtf clang built-in not available\"
 	// ff
 	extern float sqrtf(...);
 #else
 	#define sqrtf __builtin_sqrtf
-#endif */
-
-#undef sqrtf
-#define sqrtf sqrt
+#endif

 #if !__has_builtin(__builtin_sqrtl)
 	#error \"sqrtl clang built-in not available\"
--- a/NativePath/ShaderFastMathLib.h
+++ b/NativePath/ShaderFastMathLib.h
@ -56,13 +56,17 @@
 #ifndef SHADER_FAST_MATH_INC_FX
 #define SHADER_FAST_MATH_INC_FX

+/* Open a C-linkage block for C++ consumers. The brace is required: without
+ * it the linkage specification covers only the next declaration, and the
+ * closing "}" guarded by __cplusplus at the end of this header would be
+ * unmatched. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 union _float_int
 {
 	int i;
 	float f;
 };

-#include <math.h>
+#include <NativePath.h>

 // Derived from batch testing
 // TODO : Should be improved
@ -100,7 +104,7 @@ union _float_int
 //

 // Approximate guess using integer float arithmetics based on IEEE floating point standard
-float rcpSqrtIEEEIntApproximation(float inX, const int inRcpSqrtConst)
+static inline float rcpSqrtIEEEIntApproximation(float inX, const int inRcpSqrtConst)
 {
 	union _float_int x;
 	x.f = inX;
@ -108,7 +112,7 @@ float rcpSqrtIEEEIntApproximation(float inX, const int inRcpSqrtConst)
 	return x.f;
 }

-float rcpSqrtNewtonRaphson(float inXHalf, float inRcpX)
+static inline float rcpSqrtNewtonRaphson(float inXHalf, float inRcpX)
 {
    return inRcpX * (-inXHalf * (inRcpX * inRcpX) + 1.5f);
 }
@ -119,7 +123,7 @@ float rcpSqrtNewtonRaphson(float inXHalf, float inRcpX)
 // Precise format : ~small float
 // 2 ALU
 //
-float fastRcpSqrtNR0(float inX)
+static inline float fastRcpSqrtNR0(float inX)
 {
    float  xRcpSqrt = rcpSqrtIEEEIntApproximation(inX, IEEE_INT_RCP_SQRT_CONST_NR0);
    return xRcpSqrt;
@ -131,7 +135,7 @@ float fastRcpSqrtNR0(float inX)
 // Precise format : ~half float
 // 6 ALU
 //
-float fastRcpSqrtNR1(float inX)
+static inline float fastRcpSqrtNR1(float inX)
 {
    float  xhalf = 0.5f * inX;
    float  xRcpSqrt = rcpSqrtIEEEIntApproximation(inX, IEEE_INT_RCP_SQRT_CONST_NR1);
@ -145,7 +149,7 @@ float fastRcpSqrtNR1(float inX)
 // Precise format : ~full float
 // 9 ALU
 //
-float fastRcpSqrtNR2(float inX)
+static inline float fastRcpSqrtNR2(float inX)
 {
    float  xhalf = 0.5f * inX;
    float  xRcpSqrt = rcpSqrtIEEEIntApproximation(inX, IEEE_INT_RCP_SQRT_CONST_NR2);
@ -158,7 +162,7 @@ float fastRcpSqrtNR2(float inX)
 //
 // SQRT
 //
-float sqrtIEEEIntApproximation(float inX, const int inSqrtConst)
+static inline float sqrtIEEEIntApproximation(float inX, const int inSqrtConst)
 {
 	union _float_int x;
 	x.f = inX;
@ -172,7 +176,7 @@ float sqrtIEEEIntApproximation(float inX, const int inSqrtConst)
 // Precise format : ~small float
 // 1 ALU
 //
-float fastSqrtNR0(float inX)
+static inline float fastSqrtNR0(float inX)
 {
    float  xRcp = sqrtIEEEIntApproximation(inX, IEEE_INT_SQRT_CONST_NR0);
    return xRcp;
@ -185,7 +189,7 @@ float fastSqrtNR0(float inX)
 // Precise format : ~half float
 // 6 ALU
 //
-float fastSqrtNR1(float inX)
+static inline float fastSqrtNR1(float inX)
 {
    // Inverse Rcp Sqrt
    return inX * fastRcpSqrtNR1(inX);
@ -198,7 +202,7 @@ float fastSqrtNR1(float inX)
 // Precise format : ~full float
 // 9 ALU
 //
-float fastSqrtNR2(float inX)
+static inline float fastSqrtNR2(float inX)
 {
    // Inverse Rcp Sqrt
    return inX * fastRcpSqrtNR2(inX);
@ -208,7 +212,7 @@ float fastSqrtNR2(float inX)
 // RCP
 //

-float rcpIEEEIntApproximation(float inX, const int inRcpConst)
+static inline float rcpIEEEIntApproximation(float inX, const int inRcpConst)
 {
 	union _float_int x;
 	x.f = inX;
@ -216,7 +220,7 @@ float rcpIEEEIntApproximation(float inX, const int inRcpConst)
    return x.f;
 }

-float rcpNewtonRaphson(float inX, float inRcpX)
+static inline float rcpNewtonRaphson(float inX, float inRcpX)
 {
    return inRcpX * (-inRcpX * inX + 2.0f);
 }
@ -227,7 +231,7 @@ float rcpNewtonRaphson(float inX, float inRcpX)
 // Precise format : ~small float
 // 1 ALU
 //
-float fastRcpNR0(float inX)
+static inline float fastRcpNR0(float inX)
 {
    float  xRcp = rcpIEEEIntApproximation(inX, IEEE_INT_RCP_CONST_NR0);
    return xRcp;
@ -239,7 +243,7 @@ float fastRcpNR0(float inX)
 // Precise format : ~half float
 // 3 ALU
 //
-float fastRcpNR1(float inX)
+static inline float fastRcpNR1(float inX)
 {
    float  xRcp = rcpIEEEIntApproximation(inX, IEEE_INT_RCP_CONST_NR1);
    xRcp = rcpNewtonRaphson(inX, xRcp);
@ -252,7 +256,7 @@ float fastRcpNR1(float inX)
 // Precise format : ~full float
 // 5 ALU
 //
-float fastRcpNR2(float inX)
+static inline float fastRcpNR2(float inX)
 {
    float  xRcp = rcpIEEEIntApproximation(inX, IEEE_INT_RCP_CONST_NR2);
    xRcp = rcpNewtonRaphson(inX, xRcp);
@ -271,7 +275,7 @@ static const float fsl_HALF_PI = 0.5f * 3.1415926535897932384626433f;
 // 4 VGRP, 16 ALU Full Rate
 // 7 * 10^-5 radians precision
 // Reference : Handbook of Mathematical Functions (chapter : Elementary Transcendental Functions), M. Abramowitz and I.A. Stegun, Ed.
-float acosFast4(float inX)
+static inline float acosFast4(float inX)
 {
    float x1 = fabsf(inX);
    float x2 = x1 * x1;
@ -291,7 +295,7 @@ float acosFast4(float inX)
 // 4th order polynomial approximation
 // 4 VGRP, 16 ALU Full Rate
 // 7 * 10^-5 radians precision
-float asinFast4(float inX)
+static inline float asinFast4(float inX)
 {
    float x = inX;
    
@ -303,9 +307,13 @@ float asinFast4(float inX)
 // 4 VGRP, 12 ALU Full Rate
 // 7 * 10^-5 radians precision
 // Reference : Efficient approximations for the arctangent function, Rajan, S. Sichun Wang Inkol, R. Joyal, A., May 2006
-float atanFast4(float inX)
+static inline float atanFast4(float inX)
 {
    float  x = inX;
    return x*(-0.1784f * fabsf(x) - 0.0663f * x * x + 1.0301f);
 }
+
+#ifdef __cplusplus
+}
+#endif //cplusplus
 #endif //SHADER_FAST_MATH_INC_FX
--- a/NativePath/standard/stdlib.h
+++ b/NativePath/standard/stdlib.h
@ -3,6 +3,7 @@

 #include "../NativePath.h"
 #include "../NativeMemory.h"
+#include "stddef.h"

 #ifdef __cplusplus
 extern "C" {
--- a/source/NativeMath.c
+++ b/source/NativeMath.c
@ -1,193 +0,0 @@
-/*
-Copyright (c) 2015 Giovanni Petrantoni
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-//
-//  NativeMath.c
-//  NativePath
-//
-//  Created by Giovanni Petrantoni on 11/16/15.
-//  Copyright © 2015 Giovanni Petrantoni. All rights reserved.
-//
-
-#include <math.h>
-#include "ShaderFastMathLib.h"
-#include "lol_trig.h"
-
-//ShaderFastMathLib
-
-//
-// Using 0 Newton Raphson iterations
-// Relative error : ~3.4% over full
-// Precise format : ~small float
-// 2 ALU
-//
-float npFastRcpSqrtNR0(float inX)
-{
-    return fastRcpSqrtNR0(inX);
-}
-
-//
-// Using 1 Newton Raphson iterations
-// Relative error : ~0.2% over full
-// Precise format : ~half float
-// 6 ALU
-//
-float npFastRcpSqrtNR1(float inX)
-{
-    return fastRcpSqrtNR1(inX);
-}
-
-//
-// Using 2 Newton Raphson iterations
-// Relative error : ~4.6e-004%  over full
-// Precise format : ~full float
-// 9 ALU
-//
-float npFastRcpSqrtNR2(float inX)
-{
-    return fastRcpSqrtNR2(inX);
-}
-
-//
-// Using 0 Newton Raphson iterations
-// Relative error : < 0.7% over full
-// Precise format : ~small float
-// 1 ALU
-//
-float npFastSqrtNR0(float inX)
-{
-    return fastSqrtNR0(inX);
-}
-
-//
-// Use inverse Rcp Sqrt
-// Using 1 Newton Raphson iterations
-// Relative error : ~0.2% over full
-// Precise format : ~half float
-// 6 ALU
-//
-float npFastSqrtNR1(float inX)
-{
-    return fastSqrtNR1(inX);
-}
-
-//
-// Use inverse Rcp Sqrt
-// Using 2 Newton Raphson iterations
-// Relative error : ~4.6e-004%  over full
-// Precise format : ~full float
-// 9 ALU
-//
-float npFastSqrtNR2(float inX)
-{
-    return fastSqrtNR2(inX);
-}
-
-//
-// Using 0 Newton Raphson iterations
-// Relative error : < 0.4% over full
-// Precise format : ~small float
-// 1 ALU
-//
-float npFastRcpNR0(float inX)
-{
-    return fastRcpNR0(inX);
-}
-
-//
-// Using 1 Newton Raphson iterations
-// Relative error : < 0.02% over full
-// Precise format : ~half float
-// 3 ALU
-//
-float npFastRcpNR1(float inX)
-{
-    return fastRcpNR1(inX);
-}
-
-//
-// Using 2 Newton Raphson iterations
-// Relative error : < 5.0e-005%  over full
-// Precise format : ~full float
-// 5 ALU
-//
-float npFastRcpNR2(float inX)
-{
-    return fastRcpNR2(inX);
-}
-
-// 4th order polynomial approximation
-// 4 VGRP, 16 ALU Full Rate
-// 7 * 10^-5 radians precision
-// Reference : Handbook of Mathematical Functions (chapter : Elementary Transcendental Functions), M. Abramowitz and I.A. Stegun, Ed.
-float npAcosFast4(float inX)
-{
-    return acosFast4(inX);
-}
-
-// 4th order polynomial approximation
-// 4 VGRP, 16 ALU Full Rate
-// 7 * 10^-5 radians precision
-float npAsinFast4(float inX)
-{
-    return asinFast4(inX);
-}
-
-// 4th order hyperbolical approximation
-// 4 VGRP, 12 ALU Full Rate
-// 7 * 10^-5 radians precision
-// Reference : Efficient approximations for the arctangent function, Rajan, S. Sichun Wang Inkol, R. Joyal, A., May 2006
-float npAtanFast4(float inX)
-{
-    return atanFast4(inX);
-}
-
-double npLolFabs(double x)
-{
-    return _lol_fabs(x);
-}
-
-double npLolSin(double x)
-{
-    return _lol_sin(x);
-}
-
-double npLolCos(double x)
-{
-    return _lol_cos(x);
-}
-
-void npLolSincos(double x, double *sinx, double *cosx)
-{
-    _lol_sincos(x, sinx, cosx);
-}
-
-void npLolSincosf(float x, float *sinx, float *cosx)
-{
-    _lol_sincosf(x, sinx, cosx);
-}
-
-double npLolTan(double x)
-{
-    return _lol_tan(x);
-}
-
--- a/source/lol_trig.cpp
+++ b/source/lol_trig.cpp
@ -1,426 +0,0 @@
-//
-// Lol Engine
-//
-// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
-//   This program is free software; you can redistribute it and/or
-//   modify it under the terms of the Do What The Fuck You Want To
-//   Public License, Version 2, as published by Sam Hocevar. See
-//   http://www.wtfpl.net/ for more details.
-//
-
-//#include <lol/engine-internal.h>
-
-#include <cmath>
-
-static const double D_PI = 3.1415926535897932384626433f;
-
-#if defined HAVE_FASTMATH_H
-#   include <fastmath.h>
-#endif
-
-// Optimisation helpers
-#if defined __GNUC__
-#   define __likely(x)   __builtin_expect(!!(x), 1)
-#   define __unlikely(x) __builtin_expect(!!(x), 0)
-#   define INLINEATTR __attribute__((always_inline))
-#   if defined __x86_64__
-#      define FP_USE(x) __asm__("" : "+x" (x))
-#   elif defined __i386__ /* FIXME: this isn't good */
-#      define FP_USE(x) __asm__("" : "+m" (x))
-#   else
-#      define FP_USE(x) (void)(x)
-#   endif
-#else
-#   define __likely(x)   x
-#   define __unlikely(x) x
-#   define INLINEATTR
-#   define FP_USE(x) (void)(x)
-#endif
-
-namespace lol
-{
-    
-    static const double PI_2   = 1.57079632679489661923132;
-    static const double PI_4   = 0.785398163397448309615661;
-    static const double INV_PI = 0.318309886183790671537768;
-    static const double ROOT3  = 1.73205080756887729352745;
-    
-    static const double ZERO    = 0.0;
-    static const double ONE     = 1.0;
-    static const double NEG_ONE = -1.0;
-    static const double HALF    = 0.5;
-    static const double QUARTER = 0.25;
-    static const double TWO     = 2.0;
-#if defined __GNUC__
-    static const double VERY_SMALL_NUMBER = 0x1.0p-128;
-#else
-    static const double VERY_SMALL_NUMBER = 3e-39;
-#endif
-    static const double TWO_EXP_52 = 4503599627370496.0;
-    static const double TWO_EXP_54 = 18014398509481984.0;
-    
-    /** sin Taylor series coefficients. */
-    static const double SC[] =
-    {
-        -1.6449340668482264364724e-0, // π^2/3!
-        +8.1174242528335364363700e-1, // π^4/5!
-        -1.9075182412208421369647e-1, // π^6/7!
-        +2.6147847817654800504653e-2, // π^8/9!
-        -2.3460810354558236375089e-3, // π^10/11!
-        +1.4842879303107100368487e-4, // π^12/13!
-        -6.9758736616563804745344e-6, // π^14/15!
-        +2.5312174041370276513517e-7, // π^16/17!
-    };
-    
-    /* Note: the last value should be -1.3878952462213772114468e-7 (ie.
-     * π^18/18!) but we tweak it in order to get the better average precision
-     * required for tan() computations when close to π/2+kπ values. */
-    static const double CC[] =
-    {
-        -4.9348022005446793094172e-0, // π^2/2!
-        +4.0587121264167682181850e-0, // π^4/4!
-        -1.3352627688545894958753e-0, // π^6/6!
-        +2.3533063035889320454188e-1, // π^8/8!
-        -2.5806891390014060012598e-2, // π^10/10!
-        +1.9295743094039230479033e-3, // π^12/12!
-        -1.0463810492484570711802e-4, // π^14/14!
-        +4.3030695870329470072978e-6, // π^16/16!
-        -1.3777e-7,
-    };
-    
-    /* These coefficients use Sloane’s http://oeis.org/A002430 and
-     * http://oeis.org/A036279 sequences for the Taylor series of tan().
-     * Note: the last value should be 2.12485922978838540352881e5 (ie.
-     * 443861162*π^18/1856156927625), but we tweak it in order to get
-     * sub 1e-11 average precision in a larger range. */
-    static const double TC[] =
-    {
-        3.28986813369645287294483e0, // π^2/3
-        1.29878788045336582981920e1, // 2*π^4/15
-        5.18844961612069061254404e1, // 17*π^6/315
-        2.07509320280908496804928e2, // 62*π^8/2835
-        8.30024701695986756361561e2, // 1382*π^10/155925
-        3.32009324029001216460018e3, // 21844*π^12/6081075
-        1.32803704909665483598490e4, // 929569*π^14/638512875
-        5.31214808666037709352112e4, // 6404582*π^16/10854718875
-        2.373e5,
-    };
-    
-    static inline double lol_fabs(double x) INLINEATTR;
-#if defined __GNUC__
-    static inline double lol_round(double x) INLINEATTR;
-    static inline double lol_trunc(double x) INLINEATTR;
-#endif
-    
-    static inline double lol_fabs(double x)
-    {
-#if defined __GNUC__
-        return __builtin_fabs(x);
-#else
-        using std::fabs;
-        return fabs(x);
-#endif
-    }
-    
-#if defined __GNUC__
-    static inline double lol_round(double x)
-    {
-        return __builtin_round(x);
-    }
-    
-    static inline double lol_trunc(double x)
-    {
-        return __builtin_trunc(x);
-    }
-#endif
-    
-    double lol_sin(double x)
-    {
-        double absx = lol_fabs(x * INV_PI);
-        
-        /* If branches are cheap, skip the cycle count when |x| < π/4,
-         * and only do the Taylor series up to the required precision. */
-#if LOL_FEATURE_CHEAP_BRANCHES
-        if (absx < QUARTER)
-        {
-            /* Computing x^4 is one multiplication too many we do, but it helps
-             * interleave the Taylor series operations a lot better. */
-            double x2 = absx * absx;
-            double x4 = x2 * x2;
-            double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
-            double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
-            double taylor = sub2 * x2 + sub1;
-            return x * taylor;
-        }
-#endif
-        
-        /* Wrap |x| to the range [-1, 1] and keep track of the number of
-         * cycles required. If odd, we'll need to change the sign of the
-         * result. */
-        double num_cycles = absx + TWO_EXP_52;
-        FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
-        
-        double is_even = TWO * num_cycles - ONE;
-        FP_USE(is_even); is_even += TWO_EXP_54;
-        FP_USE(is_even); is_even -= TWO_EXP_54;
-        FP_USE(is_even);
-        is_even -= TWO * num_cycles - ONE;
-        double sign = is_even;
-        
-        absx -= num_cycles;
-        
-        /* If branches are very cheap, we have the option to do the Taylor
-         * series at a much lower degree by splitting. */
-#if LOL_FEATURE_VERY_CHEAP_BRANCHES
-        if (lol_fabs(absx) > QUARTER)
-        {
-            sign = (x * absx >= 0.0) ? sign : -sign;
-            
-            double x1 = HALF - lol_fabs(absx);
-            double x2 = x1 * x1;
-            double x4 = x2 * x2;
-            double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
-            double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
-            double taylor = sub2 * x2 + sub1;
-            
-            return taylor * sign;
-        }
-#endif
-        
-        sign *= (x >= 0.0) ? D_PI : -D_PI;
-        
-        /* Compute a Tailor series for sin() and combine sign information. */
-        double x2 = absx * absx;
-        double x4 = x2 * x2;
-#if LOL_FEATURE_VERY_CHEAP_BRANCHES
-        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
-        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
-#else
-        double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
-        double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
-#endif
-        double taylor = sub2 * x2 + sub1;
-        
-        return absx * taylor * sign;
-    }
-    
-    double lol_cos(double x)
-    {
-        double absx = lol_fabs(x * INV_PI);
-        
-#if LOL_FEATURE_CHEAP_BRANCHES
-        if (absx < QUARTER)
-        {
-            double x2 = absx * absx;
-            double x4 = x2 * x2;
-            double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
-            double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
-            double taylor = (sub1 * x2 + sub2) * x2 + ONE;
-            return taylor;
-        }
-#endif
-        
-        double num_cycles = absx + TWO_EXP_52;
-        FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
-        
-        double is_even = TWO * num_cycles - ONE;
-        FP_USE(is_even); is_even += TWO_EXP_54;
-        FP_USE(is_even); is_even -= TWO_EXP_54;
-        FP_USE(is_even);
-        is_even -= TWO * num_cycles - ONE;
-        double sign = is_even;
-        
-        absx -= num_cycles;
-        
-#if LOL_FEATURE_VERY_CHEAP_BRANCHES
-        if (lol_fabs(absx) > QUARTER)
-        {
-            double x1 = HALF - lol_fabs(absx);
-            double x2 = x1 * x1;
-            double x4 = x2 * x2;
-            double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
-            double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
-            double taylor = sub2 * x2 + sub1;
-            
-            return x1 * taylor * sign * D_PI;
-        }
-#endif
-        
-        double x2 = absx * absx;
-        double x4 = x2 * x2;
-#if LOL_FEATURE_VERY_CHEAP_BRANCHES
-        double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
-        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
-#else
-        double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
-        double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
-#endif
-        double taylor = sub2 * x2 + sub1;
-        
-        return taylor * sign;
-    }
-    
-    void lol_sincos(double x, double *sinx, double *cosx)
-    {
-        double absx = lol_fabs(x * INV_PI);
-        
-#if LOL_FEATURE_CHEAP_BRANCHES
-        if (absx < QUARTER)
-        {
-            double x2 = absx * absx;
-            double x4 = x2 * x2;
-            
-            /* Computing the Taylor series to the 11th order is enough to get
-             * x * 1e-11 precision, but we push it to the 13th order so that
-             * tan() has a better precision. */
-            double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
-            double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
-            double taylors = subs2 * x2 + subs1;
-            *sinx = x * taylors;
-            
-            double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
-            double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
-            double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
-            *cosx = taylorc;
-            
-            return;
-        }
-#endif
-        
-        double num_cycles = absx + TWO_EXP_52;
-        FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
-        
-        double is_even = TWO * num_cycles - ONE;
-        FP_USE(is_even); is_even += TWO_EXP_54;
-        FP_USE(is_even); is_even -= TWO_EXP_54;
-        FP_USE(is_even);
-        is_even -= TWO * num_cycles - ONE;
-        double sin_sign = is_even;
-        double cos_sign = is_even;
-        
-        absx -= num_cycles;
-        
-#if LOL_FEATURE_VERY_CHEAP_BRANCHES
-        if (lol_fabs(absx) > QUARTER)
-        {
-            cos_sign = sin_sign;
-            sin_sign = (x * absx >= 0.0) ? sin_sign : -sin_sign;
-            
-            double x1 = HALF - lol_fabs(absx);
-            double x2 = x1 * x1;
-            double x4 = x2 * x2;
-            
-            double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
-            double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
-            double taylors = subs2 * x2 + subs1;
-            *sinx = taylors * sin_sign;
-            
-            double subc1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
-            double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
-            double taylorc = subc2 * x2 + subc1;
-            *cosx = x1 * taylorc * cos_sign * D_PI;
-            
-            return;
-        }
-#endif
-        
-        sin_sign *= (x >= 0.0) ? D_PI : -D_PI;
-        
-        double x2 = absx * absx;
-        double x4 = x2 * x2;
-#if LOL_FEATURE_VERY_CHEAP_BRANCHES
-        double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
-        double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
-        double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
-        double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
-#else
-        double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
-        double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
-        /* Push Taylor series to the 19th order to enhance tan() accuracy. */
-        double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
-        double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
-#endif
-        double taylors = subs2 * x2 + subs1;
-        *sinx = absx * taylors * sin_sign;
-        
-        double taylorc = subc2 * x2 + subc1;
-        *cosx = taylorc * cos_sign;
-    }
-    
-    void lol_sincos(float x, float *sinx, float *cosx)
-    {
-        double x2 = static_cast<double>(x);
-        double s2, c2;
-        lol_sincos(x2, &s2, &c2);
-        *sinx = static_cast<float>(s2);
-        *cosx = static_cast<float>(c2);
-    }
-    
-    double lol_tan(double x)
-    {
-#if LOL_FEATURE_CHEAP_BRANCHES
-        double absx = lol_fabs(x * INV_PI);
-        
-        /* This value was determined empirically to ensure an error of no
-         * more than x * 1e-11 in this range. */
-        if (absx < 0.163)
-        {
-            double x2 = absx * absx;
-            double x4 = x2 * x2;
-            double sub1 = (((TC[7] * x4 + TC[5]) * x4
-                            + TC[3]) * x4 + TC[1]) * x4 + ONE;
-            double sub2 = (((TC[8] * x4 + TC[6]) * x4
-                            + TC[4]) * x4 + TC[2]) * x4 + TC[0];
-            double taylor = sub2 * x2 + sub1;
-            return x * taylor;
-        }
-#endif
-        
-        double sinx, cosx;
-        lol_sincos(x, &sinx, &cosx);
-        
-        /* Ensure cosx isn't zero. FIXME: we lose the cosx sign here. */
-        double absc = lol_fabs(cosx);
-        
-        if (__unlikely(absc < VERY_SMALL_NUMBER))
-            cosx = VERY_SMALL_NUMBER;
-        return sinx / cosx;
-    }
-    
-} /* namespace lol */
-
-#ifdef __cplusplus
-extern "C"
-{
-    double _lol_fabs(double x)
-    {
-        return lol::lol_fabs(x);
-    }
-    
-    double _lol_sin(double x)
-    {
-        return lol::lol_sin(x);
-    }
-    
-    double _lol_cos(double x)
-    {
-        return lol::lol_cos(x);
-    }
-    
-    void _lol_sincos(double x, double *sinx, double *cosx)
-    {
-        lol::lol_sincos(x, sinx, cosx);
-    }
-    
-    void _lol_sincosf(float x, float *sinx, float *cosx)
-    {
-        lol::lol_sincos(x, sinx, cosx);
-    }
-    
-    double _lol_tan(double x)
-    {
-        return lol::lol_tan(x);
-    }
-}
-#endif
-
--- a/source/lol_trig.h
+++ b/source/lol_trig.h
@ -1,21 +0,0 @@
-//
-//  lol_trig.h
-//  NativePath
-//
-//  Created by Void on 11/17/15.
-//  Copyright © 2015 Voidtarget. All rights reserved.
-//
-
-#ifndef lol_trig_h
-#define lol_trig_h
-
-double _lol_fabs(double x);
-double _lol_round(double x);
-double _lol_trunc(double x);
-double _lol_sin(double x);
-double _lol_cos(double x);
-void _lol_sincos(double x, double *sinx, double *cosx);
-void _lol_sincosf(float x, float *sinx, float *cosx);
-double _lol_tan(double x);
-
-#endif /* lol_trig_h */
--- a/tools/np-build.lua
+++ b/tools/np-build.lua
@ -17,7 +17,7 @@ function BuildWindows32DLL(cfile, isCpp)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
 	if isCpp then flags = flags.." -std=c++1z " end
-	local cmd = "clang -v -m32 -DNP_WIN32 -Wall -gcodeview -fno-ms-extensions -nostdlibinc -nobuiltininc -nostdinc++ -target i686-pc-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -v -m32 -DNP_WIN32 -Wall -gcodeview -fno-ms-extensions -nostdlibinc -nobuiltininc -nostdinc++ -target i686-pc-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -38,14 +38,19 @@ end

 --LLVM bytecode

-function BuildLLVM32(cfile)
+function BuildLLVM32(cfile, isCpp)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -m32 -nostdlibinc -nobuiltininc -nostdinc++ -fno-exceptions "..common_flags.." "..flags.." -o "..cfile..".ll ".." -S -c -emit-llvm -target i386-unknown "..cfile;
+	if isCpp then flags = flags.." -std=c++1z -fno-rtti -fno-exceptions" end
+	local cmd = "clang -DNP_LLVM_BC -m32 -nostdlibinc -nobuiltininc -nostdinc++ "..common_flags.." "..flags.." -o "..cfile..".bc ".." -c -emit-llvm -target i386-unknown "..cfile
+	local cmdLL = "clang -DNP_LLVM_BC -m32 -nostdlibinc -nobuiltininc -nostdinc++ "..common_flags.." "..flags.." -o "..cfile..".ll ".." -S -c -emit-llvm -target i386-unknown "..cfile
+	local cmdPP = "clang -DNP_LLVM_BC -m32 -nostdlibinc -nobuiltininc -nostdinc++ "..common_flags.." "..flags.." -E "..cfile.." > "..cfile..".pp"
 	if is_verbose == true then
 		print(cmd)
 	end
 	if os.execute(cmd) == 0 then table.insert(objs, cfile..".bc") end
+	os.execute(cmdLL)
+	os.execute(cmdPP)
 end

 function LinkLLVM32()
@ -53,17 +58,110 @@ function LinkLLVM32()
 	for i, o in ipairs(objs) do
 		objs_str = objs_str..o.." "
 	end
-	local cmd = "llvm-link -o LLVM32\\"..outputName..".bc "..objs_str
+	local cmd = "llvm-link -o LLVM\\"..outputName.."-i386.bc "..objs_str
 	if is_verbose == true then
 		print(cmd)
 	end
 	os.execute(cmd)
 end

-function BuildLLVM64(cfile)
+function BuildLLVMarmv7(cfile, isCpp)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -m64 -nostdlibinc -nobuiltininc -nostdinc++ -fno-exceptions "..common_flags.." "..flags.." -o "..cfile..".ll ".." -S -c -emit-llvm -target x86_64-unknown "..cfile;
+	if isCpp then flags = flags.." -std=c++1z -fno-rtti -fno-exceptions" end
+	local cmd = "clang -DNP_LLVM_BC -nostdlibinc -nobuiltininc -nostdinc++ -mfpu=neon -mfloat-abi=hard "..common_flags.." "..flags.." -o "..cfile..".bc ".." -c -emit-llvm -target armv7-unknown "..cfile
+	if is_verbose == true then
+		print(cmd)
+	end
+	if os.execute(cmd) == 0 then table.insert(objs, cfile..".bc") end
+end
+
+-- Link every bitcode file listed in objs into LLVM\<outputName>-armv7.bc using llvm-link.
+function LinkLLVMarmv7()
+	local inputs = {}
+	for idx, path in ipairs(objs) do
+		inputs[idx] = path.." "
+	end
+	local cmd = "llvm-link -o LLVM\\"..outputName.."-armv7.bc "..table.concat(inputs)
+	if is_verbose == true then print(cmd) end
+	-- The exit status of llvm-link is not inspected.
+	os.execute(cmd)
+end
+
+-- Compile cfile to LLVM bitcode for armv7s-unknown (NEON, hard-float); "<cfile>.bc" joins objs when os.execute returns 0.
+function BuildLLVMarmv7s(cfile, isCpp)
+	local opts
+	if debug then opts = debug_flags else opts = release_flags end
+	if isCpp then opts = opts.." -std=c++1z -fno-rtti -fno-exceptions" end
+	local bitcode = cfile..".bc"
+	local cmd = "clang -DNP_LLVM_BC -nostdlibinc -nobuiltininc -nostdinc++ -mfpu=neon -mfloat-abi=hard "..common_flags.." "..opts.." -o "..bitcode.." ".." -c -emit-llvm -target armv7s-unknown "..cfile
+	if is_verbose == true then print(cmd) end
+	if os.execute(cmd) == 0 then objs[#objs + 1] = bitcode end
+end
+
+-- Link every bitcode file listed in objs into LLVM\<outputName>-armv7s.bc using llvm-link.
+function LinkLLVMarmv7s()
+	local inputs = {}
+	for idx, path in ipairs(objs) do
+		inputs[idx] = path.." "
+	end
+	local cmd = "llvm-link -o LLVM\\"..outputName.."-armv7s.bc "..table.concat(inputs)
+	if is_verbose == true then print(cmd) end
+	-- The exit status of llvm-link is not inspected.
+	os.execute(cmd)
+end
+
+-- Compile cfile to LLVM bitcode for aarch64-unknown; "<cfile>.bc" joins objs when os.execute returns 0.
+function BuildLLVMAArch64(cfile, isCpp)
+	local opts
+	if debug then opts = debug_flags else opts = release_flags end
+	if isCpp then opts = opts.." -std=c++1z -fno-rtti -fno-exceptions" end
+	local bitcode = cfile..".bc"
+	local cmd = "clang -DNP_LLVM_BC -nostdlibinc -nobuiltininc -nostdinc++ "..common_flags.." "..opts.." -o "..bitcode.." ".." -c -emit-llvm -target aarch64-unknown "..cfile
+	if is_verbose == true then print(cmd) end
+	if os.execute(cmd) == 0 then objs[#objs + 1] = bitcode end
+end
+
+-- Link every bitcode file listed in objs into LLVM\<outputName>-aarch64.bc using llvm-link.
+function LinkLLVMAArch64()
+	local inputs = {}
+	for idx, path in ipairs(objs) do
+		inputs[idx] = path.." "
+	end
+	local cmd = "llvm-link -o LLVM\\"..outputName.."-aarch64.bc "..table.concat(inputs)
+	if is_verbose == true then print(cmd) end
+	-- The exit status of llvm-link is not inspected.
+	os.execute(cmd)
+end
+
+-- Compile cfile to LLVM bitcode for armv6-unknown (VFP, hard-float); "<cfile>.bc" joins objs when os.execute returns 0.
+function BuildLLVMarmv6(cfile, isCpp)
+	local opts
+	if debug then opts = debug_flags else opts = release_flags end
+	if isCpp then opts = opts.." -std=c++1z -fno-rtti -fno-exceptions" end
+	local bitcode = cfile..".bc"
+	local cmd = "clang -DNP_LLVM_BC -nostdlibinc -nobuiltininc -nostdinc++ -mfloat-abi=hard -mfpu=vfp "..common_flags.." "..opts.." -o "..bitcode.." ".." -c -emit-llvm -target armv6-unknown "..cfile
+	if is_verbose == true then print(cmd) end
+	if os.execute(cmd) == 0 then objs[#objs + 1] = bitcode end
+end
+
+-- Link every bitcode file listed in objs into LLVM\<outputName>-armv6.bc using llvm-link.
+function LinkLLVMarmv6()
+	local inputs = {}
+	for idx, path in ipairs(objs) do
+		inputs[idx] = path.." "
+	end
+	local cmd = "llvm-link -o LLVM\\"..outputName.."-armv6.bc "..table.concat(inputs)
+	if is_verbose == true then print(cmd) end
+	-- The exit status of llvm-link is not inspected.
+	os.execute(cmd)
+end
+
+function BuildLLVM64(cfile, isCpp)
+	local flags = ""
+	if debug then flags = debug_flags else flags = release_flags end
+	if isCpp then flags = flags.." -std=c++1z -fno-rtti -fno-exceptions" end
+	local cmd = "clang -DNP_LLVM_BC -m64 -nostdlibinc -nobuiltininc -nostdinc++ "..common_flags.." "..flags.." -o "..cfile..".bc ".." -c -emit-llvm -target x86_64-unknown "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -75,7 +173,7 @@ function LinkLLVM64()
 	for i, o in ipairs(objs) do
 		objs_str = objs_str..o.." "
 	end
-	local cmd = "llvm-link -o LLVM64\\"..outputName..".bc "..objs_str
+	local cmd = "llvm-link -o LLVM\\"..outputName.."-x86_64.bc "..objs_str
 	if is_verbose == true then
 		print(cmd)
 	end
@ -87,7 +185,7 @@ end
 function BuildWindows32(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -m32 -DNP_WIN32 -gcodeview -fno-ms-extensions -nostdlibinc -nobuiltininc -nostdinc++ -target i686-pc-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -m32 -DNP_WIN32 -gcodeview -fno-ms-extensions -nostdlibinc -nobuiltininc -nostdinc++ -target i686-pc-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -109,7 +207,7 @@ end
 function BuildWindows64(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -m64 -DNP_WIN32 -gcodeview -fno-ms-extensions -nostdlibinc -nobuiltininc -nostdinc++ -target i686-pc-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -m64 -DNP_WIN32 -gcodeview -fno-ms-extensions -nostdlibinc -nobuiltininc -nostdinc++ -target i686-pc-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -155,7 +253,7 @@ end
 function BuildWindowsUWP64(cfile)
 	local flags = ""
 	if debug then flags = debug_ms_flags else flags = release_ms_flags end
-	local cmd = "clang-cl -DNP_WIN32 -WX -EHsc -GS- -MD -DWIN_EXPORT -m64 "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang-cl -DNP_WIN32 -WX -EHsc -GS- -MD -DWIN_EXPORT -m64 "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -177,7 +275,7 @@ end
 function BuildWindowsUWPARM(cfile)
 	local flags = ""
 	if debug then flags = debug_ms_flags else flags = release_ms_flags end
-	local cmd = "clang-cl -DNP_WIN32 -WX -EHsc -GS- -MD -DWIN_EXPORT -m32 --target=thumbv7-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang-cl -DNP_WIN32 -WX -EHsc -GS- -MD -DWIN_EXPORT -m32 --target=thumbv7-windows-msvc "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -266,7 +364,7 @@ function BuildIOSArm7(cfile, isCpp)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
 	if isCpp then flags = flags.." -std=c++1z " end
-	local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target armv7-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target armv7-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -291,7 +389,7 @@ end
 function BuildIOSArm7s(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target armv7s-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target armv7s-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -313,7 +411,7 @@ end
 function BuildIOSArm64(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target arm64-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target arm64-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -335,7 +433,7 @@ end
 function BuildIOSx86(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target i386-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target i386-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -357,7 +455,7 @@ end
 function BuildIOSx64(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target x86_64-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_IOS -nostdlibinc -nobuiltininc -nostdinc++ -mios-version-min=6.0 -target x86_64-apple-ios "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -381,7 +479,7 @@ end
 function BuildMacOSx86(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_MACOS -nostdlibinc -nobuiltininc -nostdinc++ -mmacosx-version-min=10.5 -target i386-apple-macosx "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_MACOS -nostdlibinc -nobuiltininc -nostdinc++ -mmacosx-version-min=10.5 -target i386-apple-macosx "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -403,7 +501,7 @@ end
 function BuildMacOSx64(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_MACOS -nostdlibinc -nobuiltininc -nostdinc++ -mmacosx-version-min=10.5 -target x86_64-apple-macosx "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_MACOS -nostdlibinc -nobuiltininc -nostdinc++ -mmacosx-version-min=10.5 -target x86_64-apple-macosx "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -427,7 +525,7 @@ end
 function BuildAndroidArm(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target arm-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target arm-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -449,7 +547,7 @@ end
 function BuildAndroidArm7(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target armv7-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target armv7-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -471,7 +569,7 @@ end
 function BuildAndroidArm64(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target aarch64-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target aarch64-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -493,7 +591,7 @@ end
 function BuildAndroidx86(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target i386-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target i386-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -518,7 +616,7 @@ end
 function BuildAndroidx64(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target x86_64-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_ANDROID -nostdlibinc -nobuiltininc -nostdinc++ -target x86_64-none-android "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -542,7 +640,7 @@ end
 function BuildLinuxX64(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_LINUX -nostdlibinc -nobuiltininc -nostdinc++ -fPIC -target x86_64-linux-gnu "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_LINUX -nostdlibinc -nobuiltininc -nostdinc++ -fPIC -target x86_64-linux-gnu "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -564,7 +662,7 @@ end
 function BuildLinuxX86(cfile)
 	local flags = ""
 	if debug then flags = debug_flags else flags = release_flags end
-	local cmd = "clang -DNP_LINUX -nostdlibinc -nobuiltininc -nostdinc++ -fPIC -target i386-linux-gnu "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile;
+	local cmd = "clang -DNP_LINUX -nostdlibinc -nobuiltininc -nostdinc++ -fPIC -target i386-linux-gnu "..common_flags.." "..flags.." -o "..cfile..".o ".." -c "..cfile
 	if is_verbose == true then
 		print(cmd)
 	end
@ -767,23 +865,67 @@ elseif platform == "macos" then
 	os.remove("macOS\\"..outputName.."_x86_64.a")
 	
 elseif platform == "llvm" then
-	lfs.mkdir("LLVM32")
+	lfs.mkdir("LLVM")
 	
 	objs = {}
    print ("Building LLVM x86...")
 	for i,f in ipairs(cfiles) do
-		BuildLLVM32(f)
+		BuildLLVM32(f, false)
+	end
+	for i,f in ipairs(cppfiles) do
+		BuildLLVM32(f, true)
 	end
 	LinkLLVM32()
 	
-	lfs.mkdir("LLVM64")
-	
 	objs = {}
    print ("Building LLVM x64...")
 	for i,f in ipairs(cfiles) do
-		BuildLLVM64(f)
+		BuildLLVM64(f, false)
+	end
+	for i,f in ipairs(cppfiles) do
+		BuildLLVM64(f, true)
 	end
 	LinkLLVM64()
+	
+	objs = {}
+    print ("Building LLVM armv6...")
+	for i,f in ipairs(cfiles) do
+		BuildLLVMarmv6(f, false)
+	end
+	for i,f in ipairs(cppfiles) do
+		BuildLLVMarmv6(f, true)
+	end
+	LinkLLVMarmv6()
+	
+	objs = {}
+    print ("Building LLVM armv7...")
+	for i,f in ipairs(cfiles) do
+		BuildLLVMarmv7(f, false)
+	end
+	for i,f in ipairs(cppfiles) do
+		BuildLLVMarmv7(f, true)
+	end
+	LinkLLVMarmv7()
+	
+	objs = {}
+    print ("Building LLVM armv7s...")
+	for i,f in ipairs(cfiles) do
+		BuildLLVMarmv7s(f, false)
+	end
+	for i,f in ipairs(cppfiles) do
+		BuildLLVMarmv7s(f, true)
+	end
+	LinkLLVMarmv7s()
+	
+	objs = {}
+    print ("Building LLVM AArch64...")
+	for i,f in ipairs(cfiles) do
+		BuildLLVMAArch64(f, false)
+	end
+	for i,f in ipairs(cppfiles) do
+		BuildLLVMAArch64(f, true)
+	end
+	LinkLLVMAArch64()

 elseif platform == "linux" then
 	lfs.mkdir("Linux")