Enable cross-compilation of the CNTK sources for ARM64 devices.

This commit replaces SSE-based function implementations with non-SSE versions and updates the Makefile so that SSE_FLAGS can be overridden when compiling for non-Intel architectures.
Parent: 279b25680c
Commit: a183815dd1

Makefile | 9
@@ -67,13 +67,18 @@ endif
 # The mpic++ wrapper only adds MPI specific flags to the g++ command line.
 # The actual compiler/linker flags added can be viewed by running 'mpic++ --showme:compile' and 'mpic++ --showme:link'
 CXX = mpic++
+SSE_FLAGS = -msse4.1 -mssse3
+
+# Settings for ARM64 architectures that use a crosscompiler on a host machine.
+#CXX = aarch64-linux-gnu-g++
+#SSE_FLAGS =
 
 SOURCEDIR:= Source
 INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2LibraryDll/API Math CNTK ActionsLib ComputationNetworkLib SGDLib SequenceTrainingLib CNTK/BrainScript Readers/ReaderLib)
 # COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
 COMMON_FLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
-CPPFLAGS:=
-CXXFLAGS:= -msse4.1 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
+CPPFLAGS:=
+CXXFLAGS:= $(SSE_FLAGS) -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
 LIBPATH:=
 LIBS:=
 LDFLAGS:=
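With this change, an ARM64 cross build is enabled by uncommenting the two ARM64 lines above (or otherwise overriding CXX and SSE_FLAGS), so that aarch64-linux-gnu-g++ is used and no SSE flags are passed. As a minimal sketch (illustration only, not part of the commit): compilers only define feature macros such as __SSE4_1__ when -msse4.1 (or an -march that implies it) is in effect, so a translation unit can select its code path the same way the source changes below do.

// Sketch only: compile-time selection between the ARM64 fallback and the SSE path.
#include <cstdio>

static const char *SimdPath()
{
#if defined(__aarch64__)
    return "ARM64 build: scalar fallback (NEON implementation is a TODO)";
#elif defined(__SSE4_1__)
    return "x86 build: SSE4.1 path enabled by -msse4.1 in SSE_FLAGS";
#else
    return "generic scalar path";
#endif
}

int main()
{
    std::printf("%s\n", SimdPath());
    return 0;
}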
@@ -11,7 +11,12 @@
 #include <intrin.h> // for intrinsics
 #endif
 #ifdef __unix__
+#if !defined(__aarch64__)
 #include <x86intrin.h>
+#else
+#define _mm_free(p) free(p)
+#define _mm_malloc(a, b) malloc(a)
+#endif
 #endif
 
 namespace msra { namespace math {
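Note that this fallback maps _mm_malloc(a, b) to plain malloc(a), which drops the requested alignment. That is sufficient when callers only rely on malloc's default alignment; as a rough sketch (an assumption, not part of the commit), an alignment-preserving fallback on these unix targets could use posix_memalign instead:

// Sketch only: _mm_malloc/_mm_free replacements that honor the alignment argument.
#include <stdlib.h>

static void *aligned_malloc_fallback(size_t size, size_t alignment)
{
    void *p = NULL;
    // posix_memalign requires a power-of-two alignment that is a multiple of sizeof(void *).
    if (alignment < sizeof(void *))
        alignment = sizeof(void *);
    if (posix_memalign(&p, alignment, size) != 0)
        return NULL;
    return p;
}

static void aligned_free_fallback(void *p)
{
    free(p); // memory obtained from posix_memalign is released with free()
}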
@@ -27,6 +32,155 @@
 // newer ones: (seems no single list available)
 // ===========================================================================
 
+// The code in this file implements a float4 vector based on the SSE intrinsics available on Intel platforms.
+// Since we don't have SSE on ARM64 (NEON has similar functionality but is not identical) we cannot
+// use the SSE implementation on ARM64.
+// TODO: In the future, we should provide a NEON based implementation instead.
+#if defined(__aarch64__)
+typedef struct __m128_t
+{
+    float f[4];
+} __m128;
+
+static __m128 ZERO_M128 = {0,0,0,0};
+
+static __m128 _mm_setzero_ps()
+{
+    return ZERO_M128;
+}
+static void _mm_store_ss(float *a, const __m128 &b)
+{
+    *a = b.f[0];
+}
+static __m128 _mm_load1_ps(const float *a)
+{
+    __m128 result = {(float)*a, (float)*a, (float)*a, (float)*a};
+    return result;
+}
+static __m128 _mm_sub_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] - b.f[0],
+        a.f[1] - b.f[1],
+        a.f[2] - b.f[2],
+        a.f[3] - b.f[3] };
+
+    return result;
+}
+static __m128 _mm_and_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        (float)((int)(a.f[0]) & (int)(b.f[0])),
+        (float)((int)(a.f[1]) & (int)(b.f[1])),
+        (float)((int)(a.f[2]) & (int)(b.f[2])),
+        (float)((int)(a.f[3]) & (int)(b.f[3])) };
+
+    return result;
+}
+static __m128 _mm_or_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        (float)((int)(a.f[0]) | (int)(b.f[0])),
+        (float)((int)(a.f[1]) | (int)(b.f[1])),
+        (float)((int)(a.f[2]) | (int)(b.f[2])),
+        (float)((int)(a.f[3]) | (int)(b.f[3])) };
+
+    return result;
+}
+static __m128 _mm_add_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] + b.f[0],
+        a.f[1] + b.f[1],
+        a.f[2] + b.f[2],
+        a.f[3] + b.f[3] };
+
+    return result;
+}
+static __m128 _mm_mul_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] * b.f[0],
+        a.f[1] * b.f[1],
+        a.f[2] * b.f[2],
+        a.f[3] * b.f[3] };
+
+    return result;
+}
+static __m128 _mm_div_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] / b.f[0],
+        a.f[1] / b.f[1],
+        a.f[2] / b.f[2],
+        a.f[3] / b.f[3] };
+
+    return result;
+}
+static __m128 _mm_hadd_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] + a.f[1],
+        a.f[2] + a.f[3],
+        b.f[0] + b.f[1],
+        b.f[2] + b.f[3] };
+
+    return result;
+}
+static __m128 _mm_cmpge_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] >= b.f[0] ? 1.0f : 0.0f,
+        a.f[1] >= b.f[1] ? 1.0f : 0.0f,
+        a.f[2] >= b.f[2] ? 1.0f : 0.0f,
+        a.f[3] >= b.f[3] ? 1.0f : 0.0f };
+
+    return result;
+}
+static __m128 _mm_cmple_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] <= b.f[0] ? 1.0f : 0.0f,
+        a.f[1] <= b.f[1] ? 1.0f : 0.0f,
+        a.f[2] <= b.f[2] ? 1.0f : 0.0f,
+        a.f[3] <= b.f[3] ? 1.0f : 0.0f };
+
+    return result;
+}
+
+#define _MM_TRANSPOSE4_PS( c1, c2, c3, c4 ) \
+{ \
+    float4 t1, t2, t3, t4; \
+    \
+    t1.v.f[0] = c1.v.f[0]; \
+    t1.v.f[1] = c2.v.f[0]; \
+    t1.v.f[2] = c3.v.f[0]; \
+    t1.v.f[3] = c4.v.f[0]; \
+    \
+    t2.v.f[0] = c1.v.f[1]; \
+    t2.v.f[1] = c2.v.f[1]; \
+    t2.v.f[2] = c3.v.f[1]; \
+    t2.v.f[3] = c4.v.f[1]; \
+    \
+    t3.v.f[0] = c1.v.f[2]; \
+    t3.v.f[1] = c2.v.f[2]; \
+    t3.v.f[2] = c3.v.f[2]; \
+    t3.v.f[3] = c4.v.f[2]; \
+    \
+    t4.v.f[0] = c1.v.f[3]; \
+    t4.v.f[1] = c2.v.f[3]; \
+    t4.v.f[2] = c3.v.f[3]; \
+    t4.v.f[3] = c4.v.f[3]; \
+    \
+    c1 = t1; \
+    c2 = t2; \
+    c3 = t3; \
+    c4 = t4; \
+}
+
+#define _mm_prefetch(a, b)
+#endif
+
 class float4
 {
     __m128 v; // value
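The TODO above calls for a NEON-based implementation. As a rough, hypothetical sketch (not part of this commit), several of these helpers map directly onto NEON intrinsics from <arm_neon.h>; the names below are placeholders, not the SSE intrinsic names used in the diff:

// Hypothetical NEON equivalents for a future ARM64 float4 implementation.
#include <arm_neon.h>

typedef float32x4_t vec4f; // NEON's native 4-wide float vector

static inline vec4f vec4f_setzero()               { return vdupq_n_f32(0.0f); }
static inline vec4f vec4f_load1(const float *a)   { return vdupq_n_f32(*a); }
static inline vec4f vec4f_add(vec4f a, vec4f b)   { return vaddq_f32(a, b); }
static inline vec4f vec4f_sub(vec4f a, vec4f b)   { return vsubq_f32(a, b); }
static inline vec4f vec4f_mul(vec4f a, vec4f b)   { return vmulq_f32(a, b); }
static inline vec4f vec4f_div(vec4f a, vec4f b)   { return vdivq_f32(a, b); } // AArch64 only
static inline void  vec4f_store_ss(float *a, vec4f b) { *a = vgetq_lane_f32(b, 0); }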
@@ -3,6 +3,14 @@
 // Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
 //
 #include "stdafx.h"
+
+// This class implements a block handler based on the SSE intrinsics available on Intel platforms.
+// Since we don't have SSE on ARM64 (NEON has similar functionality but is not identical) we cannot
+// use the BlockHandlerSSE implementation on ARM64.
+// Therefore, exclude the implementation on ARM64 builds for now.
+// TODO: In the future, we should provide a NEON based implementation instead.
+#if !defined(__aarch64__)
+
 #include <xmmintrin.h>
 #include <emmintrin.h>
 #include <tmmintrin.h>
@@ -30,3 +38,5 @@ int BlockHandlerSSE::RowToColOffsetRewrittenB(int col, int kOffset, int blockSiz
 
 
 }}}
+
+#endif
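Because the whole translation unit is now excluded on ARM64, any code that instantiates BlockHandlerSSE needs an equivalent guard (or an alternative handler) in ARM64 builds. A hypothetical consumer-side sketch, with BlockHandlerScalar standing in as a placeholder name for whatever non-SSE handler is eventually provided:

// Hypothetical sketch (not from the commit): compile-time choice of block handler.
#if !defined(__aarch64__)
typedef BlockHandlerSSE PreferredBlockHandler;    // SSE implementation, x86 builds only
#else
typedef BlockHandlerScalar PreferredBlockHandler; // placeholder for a future non-SSE handler
#endif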