Added NEON support where SSE is supported (#1150)

* Added NEON support where SSE is supported * Adding Tests * Fixed x86 flag * Co-authored-by: Wes Barcalow <wesbarc@microsoft.com>
2020-04-14 05:43:25 +09:00 · 2020-04-14 05:43:25 +09:00 · b033d2e552
--- a/src/transformation/rgbz.c
+++ b/src/transformation/rgbz.c
@ -8,11 +8,14 @@
 #include <limits.h>
 #include <math.h>

-#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_X86)
+#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_IX86)
 #define K4A_USING_SSE
 #include <emmintrin.h> // SSE2
 #include <tmmintrin.h> // SSE3
 #include <smmintrin.h> // SSE4.1
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define K4A_USING_NEON
+#include <arm_neon.h>
 #endif

 typedef struct _k4a_transformation_input_image_t
@ -57,6 +60,28 @@ typedef struct _k4a_bounding_box_t
    int bottom_right[2];
 } k4a_bounding_box_t;

+// g_transformation_instruction_type is set to SSE, NEON, None, or NULL
+static char g_transformation_instruction_type[5] = { 0 };
+
+// Share g_transformation_instruction_type with tests to confirm this is built correctly.
+char *transformation_get_instruction_type(void);
+char *transformation_get_instruction_type(void)
+{
+    return g_transformation_instruction_type;
+}
+
+// Set the special instruction
+static void set_special_instruction_optimization(char *opt)
+{
+    // Only set this once
+    if (g_transformation_instruction_type[0] == '\0')
+    {
+        size_t sz = MIN(sizeof(opt), sizeof(g_transformation_instruction_type) - 1);
+        memcpy(g_transformation_instruction_type, opt, sz);
+        LOG_INFO("Compiled special instruction type is: %s\n", opt);
+    }
+}
+
 static k4a_transformation_image_descriptor_t
 transformation_init_image_descriptor(int width, int height, int stride, k4a_image_format_t format)
 {
@ -1058,7 +1083,7 @@ k4a_buffer_result_t transformation_color_image_to_depth_camera_internal(
    return K4A_BUFFER_RESULT_SUCCEEDED;
 }

-#if !defined(K4A_USING_SSE)
+#if !defined(K4A_USING_SSE) && !defined(K4A_USING_NEON)
 // This is the same function as transformation_depth_to_xyz without the SSE
 // instructions. This code is kept here for readability.
 static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
@ -1069,6 +1094,8 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table
    int16_t *xyz_data_int16 = (int16_t *)xyz_image_data;
    int16_t x, y, z;

+    set_special_instruction_optimization("None");
+
    for (int i = 0; i < xy_tables->width * xy_tables->height; i++)
    {
        float x_tab = xy_tables->x_table[i];
@ -1092,7 +1119,68 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table
    }
 }

-#else
+#elif defined(K4A_USING_NEON)
+// convert from float to int using NEON is round to zero
+// make separate function to do floor
+static inline int32x4_t neon_floor(float32x4_t v)
+{
+    int32x4_t v0 = vcvtq_s32_f32(v);
+    int32x4_t a0 = vreinterpretq_s32_u32(vcgtq_f32(vcvtq_f32_s32(v0), v));
+    return vaddq_s32(v0, a0);
+}
+
+static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
+                                        const void *depth_image_data,
+                                        void *xyz_image_data)
+{
+    float *x_tab = (float *)xy_tables->x_table;
+    float *y_tab = (float *)xy_tables->y_table;
+    const uint16_t *depth_image_data_uint16 = (const uint16_t *)depth_image_data;
+    int16_t *xyz_data_int16 = (int16_t *)xyz_image_data;
+    float32x4_t half = vdupq_n_f32(0.5f);
+
+    set_special_instruction_optimization("NEON");
+
+    for (int i = 0; i < xy_tables->width * xy_tables->height / 8; i++)
+    {
+        // 8 elements in 1 loop
+        int offset = i * 8;
+        float32x4_t x_tab_lo = vld1q_f32(x_tab + offset);
+        float32x4_t x_tab_hi = vld1q_f32(x_tab + offset + 4);
+        // equivalent to isnan
+        uint32x4_t valid_lo = vceqq_f32(x_tab_lo, x_tab_lo);
+        uint32x4_t valid_hi = vceqq_f32(x_tab_hi, x_tab_hi);
+        // each element in valid is a mask which corresponds to isnan
+        uint16x8_t valid = vcombine_u16(vmovn_u32(valid_lo), vmovn_u32(valid_hi));
+        uint16x8_t v_0 = vandq_u16(vld1q_u16(depth_image_data_uint16 + offset), valid);
+        // v_z corresponds to z in naive code
+        int16x8_t v_z = vreinterpretq_s16_u16(v_0);
+        // expand v_z to compute x and y
+        float32x4_t v_z_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_0)));
+        float32x4_t v_z_hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_0)));
+        // load x_table and y_table
+        float32x4_t t_x_lo = vld1q_f32(x_tab + offset);
+        float32x4_t t_x_hi = vld1q_f32(x_tab + offset + 4);
+        float32x4_t t_y_lo = vld1q_f32(y_tab + offset);
+        float32x4_t t_y_hi = vld1q_f32(y_tab + offset + 4);
+        // main computation of x and y
+        int32x4_t v_x_lo = neon_floor(vmlaq_f32(half, v_z_lo, t_x_lo));
+        int32x4_t v_x_hi = neon_floor(vmlaq_f32(half, v_z_hi, t_x_hi));
+        int32x4_t v_y_lo = neon_floor(vmlaq_f32(half, v_z_lo, t_y_lo));
+        int32x4_t v_y_hi = neon_floor(vmlaq_f32(half, v_z_hi, t_y_hi));
+        int16x8_t v_x = vcombine_s16(vmovn_s32(v_x_lo), vmovn_s32(v_x_hi));
+        int16x8_t v_y = vcombine_s16(vmovn_s32(v_y_lo), vmovn_s32(v_y_hi));
+        // use scatter store instruction
+        int16x8x3_t store;
+        store.val[0] = v_x; // x0 x1 .. x14 x15
+        store.val[1] = v_y; // y0 y1 .. y14 y15
+        store.val[2] = v_z; // z0 z1 .. z14 z15
+        // x0 y0 z0 x1 y1 z1 .. x15 y15 z15
+        vst3q_s16(xyz_data_int16 + offset * 3, store);
+    }
+}
+
+#else /* defined(K4A_USING_SSE) */

 static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
                                        const void *depth_image_data,
@ -1110,6 +1198,8 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table
    __m128 *y_table_m128 = (__m128 *)y_table;
    __m128i *xyz_data_m128i = (__m128i *)xyz_image_data;

+    set_special_instruction_optimization("SSE");
+
    const int16_t pos0 = 0x0100;
    const int16_t pos1 = 0x0302;
    const int16_t pos2 = 0x0504;
--- a/tests/Transformation/transformation.cpp
+++ b/tests/Transformation/transformation.cpp
@ -84,6 +84,9 @@ protected:
        ASSERT_EQ_FLT(A[2], B[2])                                                                                      \
    }

+// Export function from transformation.c to snoop on the compiler setting used.
+extern "C" char *transformation_get_instruction_type();
+
 static k4a_transformation_image_descriptor_t image_get_descriptor(const k4a_image_t image)
 {
    k4a_transformation_image_descriptor_t descriptor;
@ -405,6 +408,27 @@ TEST_F(transformation_ut, transformation_depth_image_to_point_cloud)
        ASSERT_EQ(check_sum, reference_val);
    }

+    {
+        // Are we compiled for the correct instruction type
+#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_IX86)
+#define SPECIAL_INSTRUCTION_OPTIMIZATION "SSE\0"
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define SPECIAL_INSTRUCTION_OPTIMIZATION "NEON"
+#else
+// Omit defining this when not SSE or NEON. Should result in a build break. We are either SSE or Neon.
+//#define SPECIAL_INSTRUCTION_OPTIMIZATION "None"
+#endif
+        char *compile_type = transformation_get_instruction_type();
+        ASSERT_NE(compile_type, (char *)nullptr);
+        ASSERT_NE(compile_type[0], '\0');
+        std::cout << "*** K4A Sensor SDK Compile type is: " << compile_type << " ***\n";
+        ASSERT_TRUE(memcmp(compile_type, SPECIAL_INSTRUCTION_OPTIMIZATION, strlen(compile_type)) == 0)
+            << "Expecting " << SPECIAL_INSTRUCTION_OPTIMIZATION << " but compiled for " << compile_type << "\n";
+        ASSERT_TRUE(memcmp(compile_type, SPECIAL_INSTRUCTION_OPTIMIZATION, strlen(compile_type)) == 0)
+            << "Expecting " << SPECIAL_INSTRUCTION_OPTIMIZATION << " but compiled for " << compile_type << "\n";
+        ASSERT_EQ(strlen(compile_type), strlen(SPECIAL_INSTRUCTION_OPTIMIZATION));
+    }
+
    image_dec_ref(depth_image);
    image_dec_ref(xyz_image);
    transformation_destroy(transformation_handle);
--- a/tests/multidevice/multidevice.cpp
+++ b/tests/multidevice/multidevice.cpp
@ -523,13 +523,13 @@ TEST_F(multidevice_sync_ft, multi_sync_validation)
    if (g_frame_rate != K4A_FRAMES_PER_SECOND_5 && g_frame_rate != K4A_FRAMES_PER_SECOND_15 &&
        g_frame_rate != K4A_FRAMES_PER_SECOND_30)
    {
-#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_X86)
-        printf("Using 5, 15, or 30FPS for AMD64/x86 build\n");
-        int frame_rate_rand = (int)RAND_VALUE(0, 2);
-#else
+#if defined(__aarch64__) || defined(_M_ARM64)
        // Jetson Nano can't handle 2 30FPS streams
        printf("Using 5 or 15FPS for ARM64 build\n");
        int frame_rate_rand = (int)RAND_VALUE(0, 1);
+#else
+        printf("Using 5, 15, or 30FPS for AMD64/x86 build\n");
+        int frame_rate_rand = (int)RAND_VALUE(0, 2);
 #endif
        switch (frame_rate_rand)
        {