Enable cross-compilation of the CNTK sources for ARM64 devices.

This commit replaces SSE-based function implementations with non-SSE versions and updates the Makefile so that SSE_FLAGS can be overridden when compiling for non-Intel architectures.
Parent: 279b25680c
Commit: a183815dd1

Makefile | 9
@@ -67,13 +67,18 @@ endif
 # The mpic++ wrapper only adds MPI specific flags to the g++ command line.
 # The actual compiler/linker flags added can be viewed by running 'mpic++ --showme:compile' and 'mpic++ --showme:link'
 CXX = mpic++
+SSE_FLAGS = -msse4.1 -mssse3
+
+# Settings for ARM64 architectures that use a crosscompiler on a host machine.
+#CXX = aarch64-linux-gnu-g++
+#SSE_FLAGS =
 
 SOURCEDIR:= Source
 INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2LibraryDll/API Math CNTK ActionsLib ComputationNetworkLib SGDLib SequenceTrainingLib CNTK/BrainScript Readers/ReaderLib)
 # COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
 COMMON_FLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
-CPPFLAGS:=
-CXXFLAGS:= -msse4.1 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
+CPPFLAGS:=
+CXXFLAGS:= $(SSE_FLAGS) -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
 LIBPATH:=
 LIBS:=
 LDFLAGS:=
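With this change, an ARM64 cross build is enabled by uncommenting the two ARM64 lines above (or otherwise overriding CXX and SSE_FLAGS), so that aarch64-linux-gnu-g++ is used and no SSE flags are passed. As a minimal sketch (illustration only, not part of the commit): compilers only define feature macros such as __SSE4_1__ when -msse4.1 (or an -march that implies it) is in effect, so a translation unit can select its code path the same way the source changes below do.

// Sketch only: compile-time selection between the ARM64 fallback and the SSE path.
#include <cstdio>

static const char *SimdPath()
{
#if defined(__aarch64__)
    return "ARM64 build: scalar fallback (NEON implementation is a TODO)";
#elif defined(__SSE4_1__)
    return "x86 build: SSE4.1 path enabled by -msse4.1 in SSE_FLAGS";
#else
    return "generic scalar path";
#endif
}

int main()
{
    std::printf("%s\n", SimdPath());
    return 0;
}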
@@ -11,7 +11,12 @@
 #include <intrin.h> // for intrinsics
 #endif
 #ifdef __unix__
+#if !defined(__aarch64__)
 #include <x86intrin.h>
+#else
+#define _mm_free(p) free(p)
+#define _mm_malloc(a, b) malloc(a)
+#endif
 #endif
 
 namespace msra { namespace math {
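Note that this fallback maps _mm_malloc(a, b) to plain malloc(a), which drops the requested alignment. That is sufficient when callers only rely on malloc's default alignment; as a rough sketch (an assumption, not part of the commit), an alignment-preserving fallback on these unix targets could use posix_memalign instead:

// Sketch only: _mm_malloc/_mm_free replacements that honor the alignment argument.
#include <stdlib.h>

static void *aligned_malloc_fallback(size_t size, size_t alignment)
{
    void *p = NULL;
    // posix_memalign requires a power-of-two alignment that is a multiple of sizeof(void *).
    if (alignment < sizeof(void *))
        alignment = sizeof(void *);
    if (posix_memalign(&p, alignment, size) != 0)
        return NULL;
    return p;
}

static void aligned_free_fallback(void *p)
{
    free(p); // memory obtained from posix_memalign is released with free()
}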
@@ -27,6 +32,155 @@
 // newer ones: (seems no single list available)
 // ===========================================================================
 
+// The code in this file implements a float4 vector based on the SSE intrinsics available on Intel platforms.
+// Since we don't have SSE on ARM64 (NEON has similar functionality but is not identical) we cannot
+// use the SSE implementation on ARM64.
+// TODO: In the future, we should provide a NEON based implementation instead.
+#if defined(__aarch64__)
+typedef struct __m128_t
+{
+    float f[4];
+} __m128;
+
+static __m128 ZERO_M128 = {0,0,0,0};
+
+static __m128 _mm_setzero_ps()
+{
+    return ZERO_M128;
+}
+static void _mm_store_ss(float *a, const __m128 &b)
+{
+    *a = b.f[0];
+}
+static __m128 _mm_load1_ps(const float *a)
+{
+    __m128 result = {(float)*a, (float)*a, (float)*a, (float)*a};
+    return result;
+}
+static __m128 _mm_sub_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] - b.f[0],
+        a.f[1] - b.f[1],
+        a.f[2] - b.f[2],
+        a.f[3] - b.f[3] };
+
+    return result;
+}
+static __m128 _mm_and_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        (float)((int)(a.f[0]) & (int)(b.f[0])),
+        (float)((int)(a.f[1]) & (int)(b.f[1])),
+        (float)((int)(a.f[2]) & (int)(b.f[2])),
+        (float)((int)(a.f[3]) & (int)(b.f[3])) };
+
+    return result;
+}
+static __m128 _mm_or_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        (float)((int)(a.f[0]) | (int)(b.f[0])),
+        (float)((int)(a.f[1]) | (int)(b.f[1])),
+        (float)((int)(a.f[2]) | (int)(b.f[2])),
+        (float)((int)(a.f[3]) | (int)(b.f[3])) };
+
+    return result;
+}
+static __m128 _mm_add_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] + b.f[0],
+        a.f[1] + b.f[1],
+        a.f[2] + b.f[2],
+        a.f[3] + b.f[3] };
+
+    return result;
+}
+static __m128 _mm_mul_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] * b.f[0],
+        a.f[1] * b.f[1],
+        a.f[2] * b.f[2],
+        a.f[3] * b.f[3] };
+
+    return result;
+}
+static __m128 _mm_div_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] / b.f[0],
+        a.f[1] / b.f[1],
+        a.f[2] / b.f[2],
+        a.f[3] / b.f[3] };
+
+    return result;
+}
+static __m128 _mm_hadd_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] + a.f[1],
+        a.f[2] + a.f[3],
+        b.f[0] + b.f[1],
+        b.f[2] + b.f[3] };
+
+    return result;
+}
+static __m128 _mm_cmpge_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] >= b.f[0] ? 1.0f : 0.0f,
+        a.f[1] >= b.f[1] ? 1.0f : 0.0f,
+        a.f[2] >= b.f[2] ? 1.0f : 0.0f,
+        a.f[3] >= b.f[3] ? 1.0f : 0.0f };
+
+    return result;
+}
+static __m128 _mm_cmple_ps(const __m128 &a, const __m128 &b)
+{
+    __m128 result = {
+        a.f[0] <= b.f[0] ? 1.0f : 0.0f,
+        a.f[1] <= b.f[1] ? 1.0f : 0.0f,
+        a.f[2] <= b.f[2] ? 1.0f : 0.0f,
+        a.f[3] <= b.f[3] ? 1.0f : 0.0f };
+
+    return result;
+}
+
+#define _MM_TRANSPOSE4_PS( c1, c2, c3, c4 ) \
+{ \
+    float4 t1, t2, t3, t4; \
+    \
+    t1.v.f[0] = c1.v.f[0]; \
+    t1.v.f[1] = c2.v.f[0]; \
+    t1.v.f[2] = c3.v.f[0]; \
+    t1.v.f[3] = c4.v.f[0]; \
+    \
+    t2.v.f[0] = c1.v.f[1]; \
+    t2.v.f[1] = c2.v.f[1]; \
+    t2.v.f[2] = c3.v.f[1]; \
+    t2.v.f[3] = c4.v.f[1]; \
+    \
+    t3.v.f[0] = c1.v.f[2]; \
+    t3.v.f[1] = c2.v.f[2]; \
+    t3.v.f[2] = c3.v.f[2]; \
+    t3.v.f[3] = c4.v.f[2]; \
+    \
+    t4.v.f[0] = c1.v.f[3]; \
+    t4.v.f[1] = c2.v.f[3]; \
+    t4.v.f[2] = c3.v.f[3]; \
+    t4.v.f[3] = c4.v.f[3]; \
+    \
+    c1 = t1; \
+    c2 = t2; \
+    c3 = t3; \
+    c4 = t4; \
+}
+
+#define _mm_prefetch(a, b)
+#endif
+
 class float4
 {
     __m128 v; // value
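The TODO above calls for a NEON-based implementation. As a rough, hypothetical sketch (not part of this commit), several of these helpers map directly onto NEON intrinsics from <arm_neon.h>; the names below are placeholders, not the SSE intrinsic names used in the diff:

// Hypothetical NEON equivalents for a future ARM64 float4 implementation.
#include <arm_neon.h>

typedef float32x4_t vec4f; // NEON's native 4-wide float vector

static inline vec4f vec4f_setzero()               { return vdupq_n_f32(0.0f); }
static inline vec4f vec4f_load1(const float *a)   { return vdupq_n_f32(*a); }
static inline vec4f vec4f_add(vec4f a, vec4f b)   { return vaddq_f32(a, b); }
static inline vec4f vec4f_sub(vec4f a, vec4f b)   { return vsubq_f32(a, b); }
static inline vec4f vec4f_mul(vec4f a, vec4f b)   { return vmulq_f32(a, b); }
static inline vec4f vec4f_div(vec4f a, vec4f b)   { return vdivq_f32(a, b); } // AArch64 only
static inline void  vec4f_store_ss(float *a, vec4f b) { *a = vgetq_lane_f32(b, 0); }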
@@ -3,6 +3,14 @@
 // Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
 //
 #include "stdafx.h"
+
+// This class implements a block handler based on the SSE intrinsics available on Intel platforms.
+// Since we don't have SSE on ARM64 (NEON has similar functionality but is not identical) we cannot
+// use the BlockHandlerSSE implementation on ARM64.
+// Therefore, exclude the implementation on ARM64 builds for now.
+// TODO: In the future, we should provide a NEON based implementation instead.
+#if !defined(__aarch64__)
+
 #include <xmmintrin.h>
 #include <emmintrin.h>
 #include <tmmintrin.h>
@@ -30,3 +38,5 @@ int BlockHandlerSSE::RowToColOffsetRewrittenB(int col, int kOffset, int blockSiz
 
 
 }}}
+
+#endif
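Because the whole translation unit is now excluded on ARM64, any code that instantiates BlockHandlerSSE needs an equivalent guard (or an alternative handler) in ARM64 builds. A hypothetical consumer-side sketch, with BlockHandlerScalar standing in as a placeholder name for whatever non-SSE handler is eventually provided:

// Hypothetical sketch (not from the commit): compile-time choice of block handler.
#if !defined(__aarch64__)
typedef BlockHandlerSSE PreferredBlockHandler;    // SSE implementation, x86 builds only
#else
typedef BlockHandlerScalar PreferredBlockHandler; // placeholder for a future non-SSE handler
#endif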