Added NEON support where SSE is supported (#1150)

* Added NEON support where SSE is supported
* Adding Tests
* Fixed x86 flag
* Co-authored-by: Wes Barcalow <wesbarc@microsoft.com>
This commit is contained in:
Tomoaki Teshima 2020-04-14 05:43:25 +09:00 коммит произвёл GitHub
Родитель e287f73b03
Коммит b033d2e552
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 121 добавлений и 7 удалений

Просмотреть файл

@ -8,11 +8,14 @@
#include <limits.h> #include <limits.h>
#include <math.h> #include <math.h>
#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_X86) #if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_IX86)
#define K4A_USING_SSE #define K4A_USING_SSE
#include <emmintrin.h> // SSE2 #include <emmintrin.h> // SSE2
#include <tmmintrin.h> // SSE3 #include <tmmintrin.h> // SSE3
#include <smmintrin.h> // SSE4.1 #include <smmintrin.h> // SSE4.1
#elif defined(__aarch64__) || defined(_M_ARM64)
#define K4A_USING_NEON
#include <arm_neon.h>
#endif #endif
typedef struct _k4a_transformation_input_image_t typedef struct _k4a_transformation_input_image_t
@ -57,6 +60,28 @@ typedef struct _k4a_bounding_box_t
int bottom_right[2]; int bottom_right[2];
} k4a_bounding_box_t; } k4a_bounding_box_t;
// g_transformation_instruction_type is set to SSE, NEON, None, or NULL
static char g_transformation_instruction_type[5] = { 0 };
// Share g_transformation_instruction_type with tests to confirm this is built correctly.
char *transformation_get_instruction_type(void);
char *transformation_get_instruction_type(void)
{
return g_transformation_instruction_type;
}
// Set the special instruction
static void set_special_instruction_optimization(char *opt)
{
// Only set this once
if (g_transformation_instruction_type[0] == '\0')
{
size_t sz = MIN(sizeof(opt), sizeof(g_transformation_instruction_type) - 1);
memcpy(g_transformation_instruction_type, opt, sz);
LOG_INFO("Compiled special instruction type is: %s\n", opt);
}
}
static k4a_transformation_image_descriptor_t static k4a_transformation_image_descriptor_t
transformation_init_image_descriptor(int width, int height, int stride, k4a_image_format_t format) transformation_init_image_descriptor(int width, int height, int stride, k4a_image_format_t format)
{ {
@ -1058,7 +1083,7 @@ k4a_buffer_result_t transformation_color_image_to_depth_camera_internal(
return K4A_BUFFER_RESULT_SUCCEEDED; return K4A_BUFFER_RESULT_SUCCEEDED;
} }
#if !defined(K4A_USING_SSE) #if !defined(K4A_USING_SSE) && !defined(K4A_USING_NEON)
// This is the same function as transformation_depth_to_xyz without the SSE // This is the same function as transformation_depth_to_xyz without the SSE
// instructions. This code is kept here for readability. // instructions. This code is kept here for readability.
static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables, static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
@ -1069,6 +1094,8 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table
int16_t *xyz_data_int16 = (int16_t *)xyz_image_data; int16_t *xyz_data_int16 = (int16_t *)xyz_image_data;
int16_t x, y, z; int16_t x, y, z;
set_special_instruction_optimization("None");
for (int i = 0; i < xy_tables->width * xy_tables->height; i++) for (int i = 0; i < xy_tables->width * xy_tables->height; i++)
{ {
float x_tab = xy_tables->x_table[i]; float x_tab = xy_tables->x_table[i];
@ -1092,7 +1119,68 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table
} }
} }
#else #elif defined(K4A_USING_NEON)
// convert from float to int using NEON is round to zero
// make separate function to do floor
static inline int32x4_t neon_floor(float32x4_t v)
{
int32x4_t v0 = vcvtq_s32_f32(v);
int32x4_t a0 = vreinterpretq_s32_u32(vcgtq_f32(vcvtq_f32_s32(v0), v));
return vaddq_s32(v0, a0);
}
static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
const void *depth_image_data,
void *xyz_image_data)
{
float *x_tab = (float *)xy_tables->x_table;
float *y_tab = (float *)xy_tables->y_table;
const uint16_t *depth_image_data_uint16 = (const uint16_t *)depth_image_data;
int16_t *xyz_data_int16 = (int16_t *)xyz_image_data;
float32x4_t half = vdupq_n_f32(0.5f);
set_special_instruction_optimization("NEON");
for (int i = 0; i < xy_tables->width * xy_tables->height / 8; i++)
{
// 8 elements in 1 loop
int offset = i * 8;
float32x4_t x_tab_lo = vld1q_f32(x_tab + offset);
float32x4_t x_tab_hi = vld1q_f32(x_tab + offset + 4);
// equivalent to isnan
uint32x4_t valid_lo = vceqq_f32(x_tab_lo, x_tab_lo);
uint32x4_t valid_hi = vceqq_f32(x_tab_hi, x_tab_hi);
// each element in valid is a mask which corresponds to isnan
uint16x8_t valid = vcombine_u16(vmovn_u32(valid_lo), vmovn_u32(valid_hi));
uint16x8_t v_0 = vandq_u16(vld1q_u16(depth_image_data_uint16 + offset), valid);
// v_z corresponds to z in naive code
int16x8_t v_z = vreinterpretq_s16_u16(v_0);
// expand v_z to compute x and y
float32x4_t v_z_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_0)));
float32x4_t v_z_hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_0)));
// load x_table and y_table
float32x4_t t_x_lo = vld1q_f32(x_tab + offset);
float32x4_t t_x_hi = vld1q_f32(x_tab + offset + 4);
float32x4_t t_y_lo = vld1q_f32(y_tab + offset);
float32x4_t t_y_hi = vld1q_f32(y_tab + offset + 4);
// main computation of x and y
int32x4_t v_x_lo = neon_floor(vmlaq_f32(half, v_z_lo, t_x_lo));
int32x4_t v_x_hi = neon_floor(vmlaq_f32(half, v_z_hi, t_x_hi));
int32x4_t v_y_lo = neon_floor(vmlaq_f32(half, v_z_lo, t_y_lo));
int32x4_t v_y_hi = neon_floor(vmlaq_f32(half, v_z_hi, t_y_hi));
int16x8_t v_x = vcombine_s16(vmovn_s32(v_x_lo), vmovn_s32(v_x_hi));
int16x8_t v_y = vcombine_s16(vmovn_s32(v_y_lo), vmovn_s32(v_y_hi));
// use scatter store instruction
int16x8x3_t store;
store.val[0] = v_x; // x0 x1 .. x14 x15
store.val[1] = v_y; // y0 y1 .. y14 y15
store.val[2] = v_z; // z0 z1 .. z14 z15
// x0 y0 z0 x1 y1 z1 .. x15 y15 z15
vst3q_s16(xyz_data_int16 + offset * 3, store);
}
}
#else /* defined(K4A_USING_SSE) */
static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables, static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
const void *depth_image_data, const void *depth_image_data,
@ -1110,6 +1198,8 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table
__m128 *y_table_m128 = (__m128 *)y_table; __m128 *y_table_m128 = (__m128 *)y_table;
__m128i *xyz_data_m128i = (__m128i *)xyz_image_data; __m128i *xyz_data_m128i = (__m128i *)xyz_image_data;
set_special_instruction_optimization("SSE");
const int16_t pos0 = 0x0100; const int16_t pos0 = 0x0100;
const int16_t pos1 = 0x0302; const int16_t pos1 = 0x0302;
const int16_t pos2 = 0x0504; const int16_t pos2 = 0x0504;

Просмотреть файл

@ -84,6 +84,9 @@ protected:
ASSERT_EQ_FLT(A[2], B[2]) \ ASSERT_EQ_FLT(A[2], B[2]) \
} }
// Export function from transformation.c to snoop on the compiler setting used.
extern "C" char *transformation_get_instruction_type();
static k4a_transformation_image_descriptor_t image_get_descriptor(const k4a_image_t image) static k4a_transformation_image_descriptor_t image_get_descriptor(const k4a_image_t image)
{ {
k4a_transformation_image_descriptor_t descriptor; k4a_transformation_image_descriptor_t descriptor;
@ -405,6 +408,27 @@ TEST_F(transformation_ut, transformation_depth_image_to_point_cloud)
ASSERT_EQ(check_sum, reference_val); ASSERT_EQ(check_sum, reference_val);
} }
{
// Are we compiled for the correct instruction type
#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_IX86)
#define SPECIAL_INSTRUCTION_OPTIMIZATION "SSE\0"
#elif defined(__aarch64__) || defined(_M_ARM64)
#define SPECIAL_INSTRUCTION_OPTIMIZATION "NEON"
#else
// Omit defining this when not SSE or NEON. Should result in a build break. We are either SSE or Neon.
//#define SPECIAL_INSTRUCTION_OPTIMIZATION "None"
#endif
char *compile_type = transformation_get_instruction_type();
ASSERT_NE(compile_type, (char *)nullptr);
ASSERT_NE(compile_type[0], '\0');
std::cout << "*** K4A Sensor SDK Compile type is: " << compile_type << " ***\n";
ASSERT_TRUE(memcmp(compile_type, SPECIAL_INSTRUCTION_OPTIMIZATION, strlen(compile_type)) == 0)
<< "Expecting " << SPECIAL_INSTRUCTION_OPTIMIZATION << " but compiled for " << compile_type << "\n";
ASSERT_TRUE(memcmp(compile_type, SPECIAL_INSTRUCTION_OPTIMIZATION, strlen(compile_type)) == 0)
<< "Expecting " << SPECIAL_INSTRUCTION_OPTIMIZATION << " but compiled for " << compile_type << "\n";
ASSERT_EQ(strlen(compile_type), strlen(SPECIAL_INSTRUCTION_OPTIMIZATION));
}
image_dec_ref(depth_image); image_dec_ref(depth_image);
image_dec_ref(xyz_image); image_dec_ref(xyz_image);
transformation_destroy(transformation_handle); transformation_destroy(transformation_handle);

Просмотреть файл

@ -523,13 +523,13 @@ TEST_F(multidevice_sync_ft, multi_sync_validation)
if (g_frame_rate != K4A_FRAMES_PER_SECOND_5 && g_frame_rate != K4A_FRAMES_PER_SECOND_15 && if (g_frame_rate != K4A_FRAMES_PER_SECOND_5 && g_frame_rate != K4A_FRAMES_PER_SECOND_15 &&
g_frame_rate != K4A_FRAMES_PER_SECOND_30) g_frame_rate != K4A_FRAMES_PER_SECOND_30)
{ {
#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_X86) #if defined(__aarch64__) || defined(_M_ARM64)
printf("Using 5, 15, or 30FPS for AMD64/x86 build\n");
int frame_rate_rand = (int)RAND_VALUE(0, 2);
#else
// Jetson Nano can't handle 2 30FPS streams // Jetson Nano can't handle 2 30FPS streams
printf("Using 5 or 15FPS for ARM64 build\n"); printf("Using 5 or 15FPS for ARM64 build\n");
int frame_rate_rand = (int)RAND_VALUE(0, 1); int frame_rate_rand = (int)RAND_VALUE(0, 1);
#else
printf("Using 5, 15, or 30FPS for AMD64/x86 build\n");
int frame_rate_rand = (int)RAND_VALUE(0, 2);
#endif #endif
switch (frame_rate_rand) switch (frame_rate_rand)
{ {