diff --git a/src/transformation/rgbz.c b/src/transformation/rgbz.c index 1cc4afac..4243d6fb 100644 --- a/src/transformation/rgbz.c +++ b/src/transformation/rgbz.c @@ -8,11 +8,14 @@ #include #include -#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_X86) +#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_IX86) #define K4A_USING_SSE #include // SSE2 #include // SSE3 #include // SSE4.1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#define K4A_USING_NEON +#include #endif typedef struct _k4a_transformation_input_image_t @@ -57,6 +60,28 @@ typedef struct _k4a_bounding_box_t int bottom_right[2]; } k4a_bounding_box_t; +// g_transformation_instruction_type is set to SSE, NEON, None, or NULL +static char g_transformation_instruction_type[5] = { 0 }; + +// Share g_transformation_instruction_type with tests to confirm this is built correctly. +char *transformation_get_instruction_type(void); +char *transformation_get_instruction_type(void) +{ + return g_transformation_instruction_type; +} + +// Set the special instruction +static void set_special_instruction_optimization(char *opt) +{ + // Only set this once + if (g_transformation_instruction_type[0] == '\0') + { + size_t sz = MIN(sizeof(opt), sizeof(g_transformation_instruction_type) - 1); + memcpy(g_transformation_instruction_type, opt, sz); + LOG_INFO("Compiled special instruction type is: %s\n", opt); + } +} + static k4a_transformation_image_descriptor_t transformation_init_image_descriptor(int width, int height, int stride, k4a_image_format_t format) { @@ -1058,7 +1083,7 @@ k4a_buffer_result_t transformation_color_image_to_depth_camera_internal( return K4A_BUFFER_RESULT_SUCCEEDED; } -#if !defined(K4A_USING_SSE) +#if !defined(K4A_USING_SSE) && !defined(K4A_USING_NEON) // This is the same function as transformation_depth_to_xyz without the SSE // instructions. This code is kept here for readability. static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables, @@ -1069,6 +1094,8 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table int16_t *xyz_data_int16 = (int16_t *)xyz_image_data; int16_t x, y, z; + set_special_instruction_optimization("None"); + for (int i = 0; i < xy_tables->width * xy_tables->height; i++) { float x_tab = xy_tables->x_table[i]; @@ -1092,7 +1119,68 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table } } -#else +#elif defined(K4A_USING_NEON) +// convert from float to int using NEON is round to zero +// make separate function to do floor +static inline int32x4_t neon_floor(float32x4_t v) +{ + int32x4_t v0 = vcvtq_s32_f32(v); + int32x4_t a0 = vreinterpretq_s32_u32(vcgtq_f32(vcvtq_f32_s32(v0), v)); + return vaddq_s32(v0, a0); +} + +static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables, + const void *depth_image_data, + void *xyz_image_data) +{ + float *x_tab = (float *)xy_tables->x_table; + float *y_tab = (float *)xy_tables->y_table; + const uint16_t *depth_image_data_uint16 = (const uint16_t *)depth_image_data; + int16_t *xyz_data_int16 = (int16_t *)xyz_image_data; + float32x4_t half = vdupq_n_f32(0.5f); + + set_special_instruction_optimization("NEON"); + + for (int i = 0; i < xy_tables->width * xy_tables->height / 8; i++) + { + // 8 elements in 1 loop + int offset = i * 8; + float32x4_t x_tab_lo = vld1q_f32(x_tab + offset); + float32x4_t x_tab_hi = vld1q_f32(x_tab + offset + 4); + // equivalent to isnan + uint32x4_t valid_lo = vceqq_f32(x_tab_lo, x_tab_lo); + uint32x4_t valid_hi = vceqq_f32(x_tab_hi, x_tab_hi); + // each element in valid is a mask which corresponds to isnan + uint16x8_t valid = vcombine_u16(vmovn_u32(valid_lo), vmovn_u32(valid_hi)); + uint16x8_t v_0 = vandq_u16(vld1q_u16(depth_image_data_uint16 + offset), valid); + // v_z corresponds to z in naive code + int16x8_t v_z = vreinterpretq_s16_u16(v_0); + // expand v_z to compute x and y + float32x4_t v_z_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_0))); + float32x4_t v_z_hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_0))); + // load x_table and y_table + float32x4_t t_x_lo = vld1q_f32(x_tab + offset); + float32x4_t t_x_hi = vld1q_f32(x_tab + offset + 4); + float32x4_t t_y_lo = vld1q_f32(y_tab + offset); + float32x4_t t_y_hi = vld1q_f32(y_tab + offset + 4); + // main computation of x and y + int32x4_t v_x_lo = neon_floor(vmlaq_f32(half, v_z_lo, t_x_lo)); + int32x4_t v_x_hi = neon_floor(vmlaq_f32(half, v_z_hi, t_x_hi)); + int32x4_t v_y_lo = neon_floor(vmlaq_f32(half, v_z_lo, t_y_lo)); + int32x4_t v_y_hi = neon_floor(vmlaq_f32(half, v_z_hi, t_y_hi)); + int16x8_t v_x = vcombine_s16(vmovn_s32(v_x_lo), vmovn_s32(v_x_hi)); + int16x8_t v_y = vcombine_s16(vmovn_s32(v_y_lo), vmovn_s32(v_y_hi)); + // use scatter store instruction + int16x8x3_t store; + store.val[0] = v_x; // x0 x1 .. x14 x15 + store.val[1] = v_y; // y0 y1 .. y14 y15 + store.val[2] = v_z; // z0 z1 .. z14 z15 + // x0 y0 z0 x1 y1 z1 .. x15 y15 z15 + vst3q_s16(xyz_data_int16 + offset * 3, store); + } +} + +#else /* defined(K4A_USING_SSE) */ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables, const void *depth_image_data, @@ -1110,6 +1198,8 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table __m128 *y_table_m128 = (__m128 *)y_table; __m128i *xyz_data_m128i = (__m128i *)xyz_image_data; + set_special_instruction_optimization("SSE"); + const int16_t pos0 = 0x0100; const int16_t pos1 = 0x0302; const int16_t pos2 = 0x0504; diff --git a/tests/Transformation/transformation.cpp b/tests/Transformation/transformation.cpp index 1e5fbd92..f5e4bd14 100644 --- a/tests/Transformation/transformation.cpp +++ b/tests/Transformation/transformation.cpp @@ -84,6 +84,9 @@ protected: ASSERT_EQ_FLT(A[2], B[2]) \ } +// Export function from transformation.c to snoop on the compiler setting used. +extern "C" char *transformation_get_instruction_type(); + static k4a_transformation_image_descriptor_t image_get_descriptor(const k4a_image_t image) { k4a_transformation_image_descriptor_t descriptor; @@ -405,6 +408,27 @@ TEST_F(transformation_ut, transformation_depth_image_to_point_cloud) ASSERT_EQ(check_sum, reference_val); } + { + // Are we compiled for the correct instruction type +#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_IX86) +#define SPECIAL_INSTRUCTION_OPTIMIZATION "SSE\0" +#elif defined(__aarch64__) || defined(_M_ARM64) +#define SPECIAL_INSTRUCTION_OPTIMIZATION "NEON" +#else +// Omit defining this when not SSE or NEON. Should result in a build break. We are either SSE or Neon. +//#define SPECIAL_INSTRUCTION_OPTIMIZATION "None" +#endif + char *compile_type = transformation_get_instruction_type(); + ASSERT_NE(compile_type, (char *)nullptr); + ASSERT_NE(compile_type[0], '\0'); + std::cout << "*** K4A Sensor SDK Compile type is: " << compile_type << " ***\n"; + ASSERT_TRUE(memcmp(compile_type, SPECIAL_INSTRUCTION_OPTIMIZATION, strlen(compile_type)) == 0) + << "Expecting " << SPECIAL_INSTRUCTION_OPTIMIZATION << " but compiled for " << compile_type << "\n"; + ASSERT_TRUE(memcmp(compile_type, SPECIAL_INSTRUCTION_OPTIMIZATION, strlen(compile_type)) == 0) + << "Expecting " << SPECIAL_INSTRUCTION_OPTIMIZATION << " but compiled for " << compile_type << "\n"; + ASSERT_EQ(strlen(compile_type), strlen(SPECIAL_INSTRUCTION_OPTIMIZATION)); + } + image_dec_ref(depth_image); image_dec_ref(xyz_image); transformation_destroy(transformation_handle); diff --git a/tests/multidevice/multidevice.cpp b/tests/multidevice/multidevice.cpp index 81915079..9ea14d7a 100644 --- a/tests/multidevice/multidevice.cpp +++ b/tests/multidevice/multidevice.cpp @@ -523,13 +523,13 @@ TEST_F(multidevice_sync_ft, multi_sync_validation) if (g_frame_rate != K4A_FRAMES_PER_SECOND_5 && g_frame_rate != K4A_FRAMES_PER_SECOND_15 && g_frame_rate != K4A_FRAMES_PER_SECOND_30) { -#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_X86) - printf("Using 5, 15, or 30FPS for AMD64/x86 build\n"); - int frame_rate_rand = (int)RAND_VALUE(0, 2); -#else +#if defined(__aarch64__) || defined(_M_ARM64) // Jetson Nano can't handle 2 30FPS streams printf("Using 5 or 15FPS for ARM64 build\n"); int frame_rate_rand = (int)RAND_VALUE(0, 1); +#else + printf("Using 5, 15, or 30FPS for AMD64/x86 build\n"); + int frame_rate_rand = (int)RAND_VALUE(0, 2); #endif switch (frame_rate_rand) {