Added NEON support where SSE is supported (#1150)
* Added NEON support where SSE is supported * Adding Tests * Fixed x86 flag * Co-authored-by: Wes Barcalow <wesbarc@microsoft.com>
This commit is contained in:
Родитель
e287f73b03
Коммит
b033d2e552
|
@ -8,11 +8,14 @@
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_X86)
|
#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_IX86)
|
||||||
#define K4A_USING_SSE
|
#define K4A_USING_SSE
|
||||||
#include <emmintrin.h> // SSE2
|
#include <emmintrin.h> // SSE2
|
||||||
#include <tmmintrin.h> // SSE3
|
#include <tmmintrin.h> // SSE3
|
||||||
#include <smmintrin.h> // SSE4.1
|
#include <smmintrin.h> // SSE4.1
|
||||||
|
#elif defined(__aarch64__) || defined(_M_ARM64)
|
||||||
|
#define K4A_USING_NEON
|
||||||
|
#include <arm_neon.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct _k4a_transformation_input_image_t
|
typedef struct _k4a_transformation_input_image_t
|
||||||
|
@ -57,6 +60,28 @@ typedef struct _k4a_bounding_box_t
|
||||||
int bottom_right[2];
|
int bottom_right[2];
|
||||||
} k4a_bounding_box_t;
|
} k4a_bounding_box_t;
|
||||||
|
|
||||||
|
// g_transformation_instruction_type is set to SSE, NEON, None, or NULL
|
||||||
|
static char g_transformation_instruction_type[5] = { 0 };
|
||||||
|
|
||||||
|
// Share g_transformation_instruction_type with tests to confirm this is built correctly.
|
||||||
|
char *transformation_get_instruction_type(void);
|
||||||
|
char *transformation_get_instruction_type(void)
|
||||||
|
{
|
||||||
|
return g_transformation_instruction_type;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set the special instruction
|
||||||
|
static void set_special_instruction_optimization(char *opt)
|
||||||
|
{
|
||||||
|
// Only set this once
|
||||||
|
if (g_transformation_instruction_type[0] == '\0')
|
||||||
|
{
|
||||||
|
size_t sz = MIN(sizeof(opt), sizeof(g_transformation_instruction_type) - 1);
|
||||||
|
memcpy(g_transformation_instruction_type, opt, sz);
|
||||||
|
LOG_INFO("Compiled special instruction type is: %s\n", opt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static k4a_transformation_image_descriptor_t
|
static k4a_transformation_image_descriptor_t
|
||||||
transformation_init_image_descriptor(int width, int height, int stride, k4a_image_format_t format)
|
transformation_init_image_descriptor(int width, int height, int stride, k4a_image_format_t format)
|
||||||
{
|
{
|
||||||
|
@ -1058,7 +1083,7 @@ k4a_buffer_result_t transformation_color_image_to_depth_camera_internal(
|
||||||
return K4A_BUFFER_RESULT_SUCCEEDED;
|
return K4A_BUFFER_RESULT_SUCCEEDED;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if !defined(K4A_USING_SSE)
|
#if !defined(K4A_USING_SSE) && !defined(K4A_USING_NEON)
|
||||||
// This is the same function as transformation_depth_to_xyz without the SSE
|
// This is the same function as transformation_depth_to_xyz without the SSE
|
||||||
// instructions. This code is kept here for readability.
|
// instructions. This code is kept here for readability.
|
||||||
static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
|
static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
|
||||||
|
@ -1069,6 +1094,8 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table
|
||||||
int16_t *xyz_data_int16 = (int16_t *)xyz_image_data;
|
int16_t *xyz_data_int16 = (int16_t *)xyz_image_data;
|
||||||
int16_t x, y, z;
|
int16_t x, y, z;
|
||||||
|
|
||||||
|
set_special_instruction_optimization("None");
|
||||||
|
|
||||||
for (int i = 0; i < xy_tables->width * xy_tables->height; i++)
|
for (int i = 0; i < xy_tables->width * xy_tables->height; i++)
|
||||||
{
|
{
|
||||||
float x_tab = xy_tables->x_table[i];
|
float x_tab = xy_tables->x_table[i];
|
||||||
|
@ -1092,7 +1119,68 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#elif defined(K4A_USING_NEON)
|
||||||
|
// convert from float to int using NEON is round to zero
|
||||||
|
// make separate function to do floor
|
||||||
|
static inline int32x4_t neon_floor(float32x4_t v)
|
||||||
|
{
|
||||||
|
int32x4_t v0 = vcvtq_s32_f32(v);
|
||||||
|
int32x4_t a0 = vreinterpretq_s32_u32(vcgtq_f32(vcvtq_f32_s32(v0), v));
|
||||||
|
return vaddq_s32(v0, a0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
|
||||||
|
const void *depth_image_data,
|
||||||
|
void *xyz_image_data)
|
||||||
|
{
|
||||||
|
float *x_tab = (float *)xy_tables->x_table;
|
||||||
|
float *y_tab = (float *)xy_tables->y_table;
|
||||||
|
const uint16_t *depth_image_data_uint16 = (const uint16_t *)depth_image_data;
|
||||||
|
int16_t *xyz_data_int16 = (int16_t *)xyz_image_data;
|
||||||
|
float32x4_t half = vdupq_n_f32(0.5f);
|
||||||
|
|
||||||
|
set_special_instruction_optimization("NEON");
|
||||||
|
|
||||||
|
for (int i = 0; i < xy_tables->width * xy_tables->height / 8; i++)
|
||||||
|
{
|
||||||
|
// 8 elements in 1 loop
|
||||||
|
int offset = i * 8;
|
||||||
|
float32x4_t x_tab_lo = vld1q_f32(x_tab + offset);
|
||||||
|
float32x4_t x_tab_hi = vld1q_f32(x_tab + offset + 4);
|
||||||
|
// equivalent to isnan
|
||||||
|
uint32x4_t valid_lo = vceqq_f32(x_tab_lo, x_tab_lo);
|
||||||
|
uint32x4_t valid_hi = vceqq_f32(x_tab_hi, x_tab_hi);
|
||||||
|
// each element in valid is a mask which corresponds to isnan
|
||||||
|
uint16x8_t valid = vcombine_u16(vmovn_u32(valid_lo), vmovn_u32(valid_hi));
|
||||||
|
uint16x8_t v_0 = vandq_u16(vld1q_u16(depth_image_data_uint16 + offset), valid);
|
||||||
|
// v_z corresponds to z in naive code
|
||||||
|
int16x8_t v_z = vreinterpretq_s16_u16(v_0);
|
||||||
|
// expand v_z to compute x and y
|
||||||
|
float32x4_t v_z_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_0)));
|
||||||
|
float32x4_t v_z_hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_0)));
|
||||||
|
// load x_table and y_table
|
||||||
|
float32x4_t t_x_lo = vld1q_f32(x_tab + offset);
|
||||||
|
float32x4_t t_x_hi = vld1q_f32(x_tab + offset + 4);
|
||||||
|
float32x4_t t_y_lo = vld1q_f32(y_tab + offset);
|
||||||
|
float32x4_t t_y_hi = vld1q_f32(y_tab + offset + 4);
|
||||||
|
// main computation of x and y
|
||||||
|
int32x4_t v_x_lo = neon_floor(vmlaq_f32(half, v_z_lo, t_x_lo));
|
||||||
|
int32x4_t v_x_hi = neon_floor(vmlaq_f32(half, v_z_hi, t_x_hi));
|
||||||
|
int32x4_t v_y_lo = neon_floor(vmlaq_f32(half, v_z_lo, t_y_lo));
|
||||||
|
int32x4_t v_y_hi = neon_floor(vmlaq_f32(half, v_z_hi, t_y_hi));
|
||||||
|
int16x8_t v_x = vcombine_s16(vmovn_s32(v_x_lo), vmovn_s32(v_x_hi));
|
||||||
|
int16x8_t v_y = vcombine_s16(vmovn_s32(v_y_lo), vmovn_s32(v_y_hi));
|
||||||
|
// use scatter store instruction
|
||||||
|
int16x8x3_t store;
|
||||||
|
store.val[0] = v_x; // x0 x1 .. x14 x15
|
||||||
|
store.val[1] = v_y; // y0 y1 .. y14 y15
|
||||||
|
store.val[2] = v_z; // z0 z1 .. z14 z15
|
||||||
|
// x0 y0 z0 x1 y1 z1 .. x15 y15 z15
|
||||||
|
vst3q_s16(xyz_data_int16 + offset * 3, store);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#else /* defined(K4A_USING_SSE) */
|
||||||
|
|
||||||
static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
|
static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
|
||||||
const void *depth_image_data,
|
const void *depth_image_data,
|
||||||
|
@ -1110,6 +1198,8 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table
|
||||||
__m128 *y_table_m128 = (__m128 *)y_table;
|
__m128 *y_table_m128 = (__m128 *)y_table;
|
||||||
__m128i *xyz_data_m128i = (__m128i *)xyz_image_data;
|
__m128i *xyz_data_m128i = (__m128i *)xyz_image_data;
|
||||||
|
|
||||||
|
set_special_instruction_optimization("SSE");
|
||||||
|
|
||||||
const int16_t pos0 = 0x0100;
|
const int16_t pos0 = 0x0100;
|
||||||
const int16_t pos1 = 0x0302;
|
const int16_t pos1 = 0x0302;
|
||||||
const int16_t pos2 = 0x0504;
|
const int16_t pos2 = 0x0504;
|
||||||
|
|
|
@ -84,6 +84,9 @@ protected:
|
||||||
ASSERT_EQ_FLT(A[2], B[2]) \
|
ASSERT_EQ_FLT(A[2], B[2]) \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Export function from transformation.c to snoop on the compiler setting used.
|
||||||
|
extern "C" char *transformation_get_instruction_type();
|
||||||
|
|
||||||
static k4a_transformation_image_descriptor_t image_get_descriptor(const k4a_image_t image)
|
static k4a_transformation_image_descriptor_t image_get_descriptor(const k4a_image_t image)
|
||||||
{
|
{
|
||||||
k4a_transformation_image_descriptor_t descriptor;
|
k4a_transformation_image_descriptor_t descriptor;
|
||||||
|
@ -405,6 +408,27 @@ TEST_F(transformation_ut, transformation_depth_image_to_point_cloud)
|
||||||
ASSERT_EQ(check_sum, reference_val);
|
ASSERT_EQ(check_sum, reference_val);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Are we compiled for the correct instruction type
|
||||||
|
#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_IX86)
|
||||||
|
#define SPECIAL_INSTRUCTION_OPTIMIZATION "SSE\0"
|
||||||
|
#elif defined(__aarch64__) || defined(_M_ARM64)
|
||||||
|
#define SPECIAL_INSTRUCTION_OPTIMIZATION "NEON"
|
||||||
|
#else
|
||||||
|
// Omit defining this when not SSE or NEON. Should result in a build break. We are either SSE or Neon.
|
||||||
|
//#define SPECIAL_INSTRUCTION_OPTIMIZATION "None"
|
||||||
|
#endif
|
||||||
|
char *compile_type = transformation_get_instruction_type();
|
||||||
|
ASSERT_NE(compile_type, (char *)nullptr);
|
||||||
|
ASSERT_NE(compile_type[0], '\0');
|
||||||
|
std::cout << "*** K4A Sensor SDK Compile type is: " << compile_type << " ***\n";
|
||||||
|
ASSERT_TRUE(memcmp(compile_type, SPECIAL_INSTRUCTION_OPTIMIZATION, strlen(compile_type)) == 0)
|
||||||
|
<< "Expecting " << SPECIAL_INSTRUCTION_OPTIMIZATION << " but compiled for " << compile_type << "\n";
|
||||||
|
ASSERT_TRUE(memcmp(compile_type, SPECIAL_INSTRUCTION_OPTIMIZATION, strlen(compile_type)) == 0)
|
||||||
|
<< "Expecting " << SPECIAL_INSTRUCTION_OPTIMIZATION << " but compiled for " << compile_type << "\n";
|
||||||
|
ASSERT_EQ(strlen(compile_type), strlen(SPECIAL_INSTRUCTION_OPTIMIZATION));
|
||||||
|
}
|
||||||
|
|
||||||
image_dec_ref(depth_image);
|
image_dec_ref(depth_image);
|
||||||
image_dec_ref(xyz_image);
|
image_dec_ref(xyz_image);
|
||||||
transformation_destroy(transformation_handle);
|
transformation_destroy(transformation_handle);
|
||||||
|
|
|
@ -523,13 +523,13 @@ TEST_F(multidevice_sync_ft, multi_sync_validation)
|
||||||
if (g_frame_rate != K4A_FRAMES_PER_SECOND_5 && g_frame_rate != K4A_FRAMES_PER_SECOND_15 &&
|
if (g_frame_rate != K4A_FRAMES_PER_SECOND_5 && g_frame_rate != K4A_FRAMES_PER_SECOND_15 &&
|
||||||
g_frame_rate != K4A_FRAMES_PER_SECOND_30)
|
g_frame_rate != K4A_FRAMES_PER_SECOND_30)
|
||||||
{
|
{
|
||||||
#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_X86)
|
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||||
printf("Using 5, 15, or 30FPS for AMD64/x86 build\n");
|
|
||||||
int frame_rate_rand = (int)RAND_VALUE(0, 2);
|
|
||||||
#else
|
|
||||||
// Jetson Nano can't handle 2 30FPS streams
|
// Jetson Nano can't handle 2 30FPS streams
|
||||||
printf("Using 5 or 15FPS for ARM64 build\n");
|
printf("Using 5 or 15FPS for ARM64 build\n");
|
||||||
int frame_rate_rand = (int)RAND_VALUE(0, 1);
|
int frame_rate_rand = (int)RAND_VALUE(0, 1);
|
||||||
|
#else
|
||||||
|
printf("Using 5, 15, or 30FPS for AMD64/x86 build\n");
|
||||||
|
int frame_rate_rand = (int)RAND_VALUE(0, 2);
|
||||||
#endif
|
#endif
|
||||||
switch (frame_rate_rand)
|
switch (frame_rate_rand)
|
||||||
{
|
{
|
||||||
|
|
Загрузка…
Ссылка в новой задаче