Enable cross-compilation of the CNTK sources for ARM64 devices.

This commit replaces the SSE-based function implementations with non-SSE versions
and updates the Makefile so that SSE_FLAGS can be overridden when compiling for non-Intel architectures.
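Because variables assigned on the make command line take precedence over assignments in the Makefile, the cross setup can also be selected without editing the file, e.g. make CXX=aarch64-linux-gnu-g++ SSE_FLAGS= (a hypothetical invocation; the commented-out settings in the Makefile hunk below achieve the same effect when enabled).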
This commit is contained in:
Friedel van Megen 2016-10-14 10:09:40 +02:00
Parent 279b25680c
Commit a183815dd1
3 changed files with 171 additions and 2 deletions

View file

@@ -67,13 +67,18 @@ endif
# The mpic++ wrapper only adds MPI specific flags to the g++ command line.
# The actual compiler/linker flags added can be viewed by running 'mpic++ --showme:compile' and 'mpic++ --showme:link'
CXX = mpic++
SSE_FLAGS = -msse4.1 -mssse3
# Settings for ARM64 architectures that use a cross-compiler on a host machine.
#CXX = aarch64-linux-gnu-g++
#SSE_FLAGS =
SOURCEDIR:= Source
INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2LibraryDll/API Math CNTK ActionsLib ComputationNetworkLib SGDLib SequenceTrainingLib CNTK/BrainScript Readers/ReaderLib)
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
COMMON_FLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
-CXXFLAGS:= -msse4.1 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
+CXXFLAGS:= $(SSE_FLAGS) -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
LIBPATH:=
LIBS:=
LDFLAGS:=

View file

@@ -11,7 +11,12 @@
#include <intrin.h> // for intrinsics
#endif
#ifdef __unix__
#if !defined(__aarch64__)
#include <x86intrin.h>
#else
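// The alignment argument of _mm_malloc is dropped here; on 64-bit glibc,
// malloc already returns 16-byte-aligned memory, which is sufficient for the
// 16-byte __m128 emulation used below.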
#define _mm_free(p) free(p)
#define _mm_malloc(a, b) malloc(a)
#endif
#endif
namespace msra { namespace math {
@@ -27,6 +32,155 @@ namespace msra { namespace math {
// newer ones: (seems no single list available)
// ===========================================================================
// The code in this file implements a float4 vector based on the SSE intrinsics available on Intel platforms.
// Since we don't have SSE on ARM64 (NEON has similar functionality but is not identical), we cannot
// use the SSE implementation on ARM64.
// TODO: In the future, we should provide a NEON-based implementation instead.
#if defined(__aarch64__)
typedef struct __m128_t
{
    float f[4];
} __m128;

static __m128 ZERO_M128 = { 0, 0, 0, 0 };

static __m128 _mm_setzero_ps()
{
    return ZERO_M128;
}
static void _mm_store_ss(float *a, const __m128 &b)
{
    *a = b.f[0];
}

static __m128 _mm_load1_ps(const float *a)
{
    __m128 result = { *a, *a, *a, *a };
    return result;
}

static __m128 _mm_sub_ps(const __m128 &a, const __m128 &b)
{
    __m128 result = {
        a.f[0] - b.f[0],
        a.f[1] - b.f[1],
        a.f[2] - b.f[2],
        a.f[3] - b.f[3] };
    return result;
}
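
// The emulated compares below return 1.0f/0.0f masks rather than the all-ones
// bit masks produced by real SSE, so _mm_and_ps/_mm_or_ps combine such masks
// logically via integer casts instead of operating on the raw bit patterns.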
static __m128 _mm_and_ps(const __m128 &a, const __m128 &b)
{
    __m128 result = {
        (float)((int)(a.f[0]) & (int)(b.f[0])),
        (float)((int)(a.f[1]) & (int)(b.f[1])),
        (float)((int)(a.f[2]) & (int)(b.f[2])),
        (float)((int)(a.f[3]) & (int)(b.f[3])) };
    return result;
}

static __m128 _mm_or_ps(const __m128 &a, const __m128 &b)
{
    __m128 result = {
        (float)((int)(a.f[0]) | (int)(b.f[0])),
        (float)((int)(a.f[1]) | (int)(b.f[1])),
        (float)((int)(a.f[2]) | (int)(b.f[2])),
        (float)((int)(a.f[3]) | (int)(b.f[3])) };
    return result;
}
static __m128 _mm_add_ps(const __m128 &a, const __m128 &b)
{
    __m128 result = {
        a.f[0] + b.f[0],
        a.f[1] + b.f[1],
        a.f[2] + b.f[2],
        a.f[3] + b.f[3] };
    return result;
}

static __m128 _mm_mul_ps(const __m128 &a, const __m128 &b)
{
    __m128 result = {
        a.f[0] * b.f[0],
        a.f[1] * b.f[1],
        a.f[2] * b.f[2],
        a.f[3] * b.f[3] };
    return result;
}

static __m128 _mm_div_ps(const __m128 &a, const __m128 &b)
{
    __m128 result = {
        a.f[0] / b.f[0],
        a.f[1] / b.f[1],
        a.f[2] / b.f[2],
        a.f[3] / b.f[3] };
    return result;
}

static __m128 _mm_hadd_ps(const __m128 &a, const __m128 &b)
{
    __m128 result = {
        a.f[0] + a.f[1],
        a.f[2] + a.f[3],
        b.f[0] + b.f[1],
        b.f[2] + b.f[3] };
    return result;
}
static __m128 _mm_cmpge_ps(const __m128 &a, const __m128 &b)
{
    __m128 result = {
        a.f[0] >= b.f[0] ? 1.0f : 0.0f,
        a.f[1] >= b.f[1] ? 1.0f : 0.0f,
        a.f[2] >= b.f[2] ? 1.0f : 0.0f,
        a.f[3] >= b.f[3] ? 1.0f : 0.0f };
    return result;
}

static __m128 _mm_cmple_ps(const __m128 &a, const __m128 &b)
{
    __m128 result = {
        a.f[0] <= b.f[0] ? 1.0f : 0.0f,
        a.f[1] <= b.f[1] ? 1.0f : 0.0f,
        a.f[2] <= b.f[2] ? 1.0f : 0.0f,
        a.f[3] <= b.f[3] ? 1.0f : 0.0f };
    return result;
}
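
// Unlike the SSE version of this macro, which works on raw __m128 values, this
// variant transposes float4 wrappers through their .v.f members, matching the
// call sites in this file.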
#define _MM_TRANSPOSE4_PS(c1, c2, c3, c4) \
{ \
    float4 t1, t2, t3, t4; \
    \
    t1.v.f[0] = c1.v.f[0]; \
    t1.v.f[1] = c2.v.f[0]; \
    t1.v.f[2] = c3.v.f[0]; \
    t1.v.f[3] = c4.v.f[0]; \
    \
    t2.v.f[0] = c1.v.f[1]; \
    t2.v.f[1] = c2.v.f[1]; \
    t2.v.f[2] = c3.v.f[1]; \
    t2.v.f[3] = c4.v.f[1]; \
    \
    t3.v.f[0] = c1.v.f[2]; \
    t3.v.f[1] = c2.v.f[2]; \
    t3.v.f[2] = c3.v.f[2]; \
    t3.v.f[3] = c4.v.f[2]; \
    \
    t4.v.f[0] = c1.v.f[3]; \
    t4.v.f[1] = c2.v.f[3]; \
    t4.v.f[2] = c3.v.f[3]; \
    t4.v.f[3] = c4.v.f[3]; \
    \
    c1 = t1; \
    c2 = t2; \
    c3 = t3; \
    c4 = t4; \
}
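// Prefetching is only a performance hint, so it is safe to make it a no-op on ARM64.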
#define _mm_prefetch(a, b)
#endif
class float4
{
    __m128 v; // value
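
The integer casts in the emulated _mm_and_ps/_mm_or_ps only match the intent of the SSE originals when both operands are the 1.0f/0.0f masks produced by the emulated compares; they are not a bitwise AND/OR of arbitrary float bit patterns. Below is a minimal standalone sketch of that mask convention (hypothetical test code, not part of this commit; the names are shortened to avoid clashing with the real intrinsics on x86):

#include <cstdio>

struct m128 { float f[4]; }; // stand-in for the fallback __m128 above

// 1.0f where a >= b, else 0.0f (mirrors the emulated _mm_cmpge_ps)
static m128 cmpge(const m128 &a, const m128 &b)
{
    m128 r = { { a.f[0] >= b.f[0] ? 1.0f : 0.0f,
                 a.f[1] >= b.f[1] ? 1.0f : 0.0f,
                 a.f[2] >= b.f[2] ? 1.0f : 0.0f,
                 a.f[3] >= b.f[3] ? 1.0f : 0.0f } };
    return r;
}

// 1.0f where a <= b, else 0.0f (mirrors the emulated _mm_cmple_ps)
static m128 cmple(const m128 &a, const m128 &b)
{
    m128 r = { { a.f[0] <= b.f[0] ? 1.0f : 0.0f,
                 a.f[1] <= b.f[1] ? 1.0f : 0.0f,
                 a.f[2] <= b.f[2] ? 1.0f : 0.0f,
                 a.f[3] <= b.f[3] ? 1.0f : 0.0f } };
    return r;
}

// Logical AND of two 0/1 masks via the same int-cast trick as the emulated
// _mm_and_ps; only meaningful for mask inputs, not raw float bit patterns.
static m128 mask_and(const m128 &a, const m128 &b)
{
    m128 r = { { (float)((int)a.f[0] & (int)b.f[0]),
                 (float)((int)a.f[1] & (int)b.f[1]),
                 (float)((int)a.f[2] & (int)b.f[2]),
                 (float)((int)a.f[3] & (int)b.f[3]) } };
    return r;
}

int main()
{
    m128 x  = { { 0.5f, 1.5f, 2.5f, 3.5f } };
    m128 lo = { { 1.0f, 1.0f, 1.0f, 1.0f } };
    m128 hi = { { 3.0f, 3.0f, 3.0f, 3.0f } };
    // In-range mask lo <= x <= hi; prints "0 1 1 0".
    m128 in = mask_and(cmpge(x, lo), cmple(x, hi));
    std::printf("%g %g %g %g\n", in.f[0], in.f[1], in.f[2], in.f[3]);
    return 0;
}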

View file

@@ -3,6 +3,14 @@
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
// This class implements a block handler based on the SSE intrinsics available on Intel platforms.
// Since we don't have SSE on ARM64 (NEON has similar functionality but is not identical), we cannot
// use the BlockHandlerSSE implementation on ARM64.
// Therefore, exclude the implementation from ARM64 builds for now.
// TODO: In the future, we should provide a NEON-based implementation instead.
#if !defined(__aarch64__)
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
@@ -30,3 +38,5 @@ int BlockHandlerSSE::RowToColOffsetRewrittenB(int col, int kOffset, int blockSiz
}}}
#endif