Skip to content
This repository was archived by the owner on Aug 22, 2024. It is now read-only.

add NEON support where SSE is supported #1150

Merged
merged 6 commits into from
Apr 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 93 additions & 3 deletions src/transformation/rgbz.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@
#include <limits.h>
#include <math.h>

#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_X86)
#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_IX86)
#define K4A_USING_SSE
#include <emmintrin.h> // SSE2
#include <tmmintrin.h> // SSE3
#include <smmintrin.h> // SSE4.1
#elif defined(__aarch64__) || defined(_M_ARM64)
#define K4A_USING_NEON
#include <arm_neon.h>
#endif

typedef struct _k4a_transformation_input_image_t
Expand Down Expand Up @@ -57,6 +60,28 @@ typedef struct _k4a_bounding_box_t
int bottom_right[2];
} k4a_bounding_box_t;

// g_transformation_instruction_type is set to SSE, NEON, None, or NULL
static char g_transformation_instruction_type[5] = { 0 };

// Share g_transformation_instruction_type with tests to confirm this is built correctly.
char *transformation_get_instruction_type(void);
char *transformation_get_instruction_type(void)
{
return g_transformation_instruction_type;
}

// Set the special instruction
static void set_special_instruction_optimization(char *opt)
{
// Only set this once
if (g_transformation_instruction_type[0] == '\0')
{
size_t sz = MIN(sizeof(opt), sizeof(g_transformation_instruction_type) - 1);
memcpy(g_transformation_instruction_type, opt, sz);
LOG_INFO("Compiled special instruction type is: %s\n", opt);
}
}

static k4a_transformation_image_descriptor_t
transformation_init_image_descriptor(int width, int height, int stride, k4a_image_format_t format)
{
Expand Down Expand Up @@ -1058,7 +1083,7 @@ k4a_buffer_result_t transformation_color_image_to_depth_camera_internal(
return K4A_BUFFER_RESULT_SUCCEEDED;
}

#if !defined(K4A_USING_SSE)
#if !defined(K4A_USING_SSE) && !defined(K4A_USING_NEON)
// This is the same function as transformation_depth_to_xyz without the SSE
// instructions. This code is kept here for readability.
static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
Expand All @@ -1069,6 +1094,8 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table
int16_t *xyz_data_int16 = (int16_t *)xyz_image_data;
int16_t x, y, z;

set_special_instruction_optimization("None");

for (int i = 0; i < xy_tables->width * xy_tables->height; i++)
{
float x_tab = xy_tables->x_table[i];
Expand All @@ -1092,7 +1119,68 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table
}
}

#else
#elif defined(K4A_USING_NEON)
// convert from float to int using NEON is round to zero
// make separate function to do floor
static inline int32x4_t neon_floor(float32x4_t v)
{
int32x4_t v0 = vcvtq_s32_f32(v);
int32x4_t a0 = vreinterpretq_s32_u32(vcgtq_f32(vcvtq_f32_s32(v0), v));
return vaddq_s32(v0, a0);
}

static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
const void *depth_image_data,
void *xyz_image_data)
{
float *x_tab = (float *)xy_tables->x_table;
float *y_tab = (float *)xy_tables->y_table;
const uint16_t *depth_image_data_uint16 = (const uint16_t *)depth_image_data;
int16_t *xyz_data_int16 = (int16_t *)xyz_image_data;
float32x4_t half = vdupq_n_f32(0.5f);

set_special_instruction_optimization("NEON");

for (int i = 0; i < xy_tables->width * xy_tables->height / 8; i++)
{
// 8 elements in 1 loop
int offset = i * 8;
float32x4_t x_tab_lo = vld1q_f32(x_tab + offset);
float32x4_t x_tab_hi = vld1q_f32(x_tab + offset + 4);
// equivalent to isnan
uint32x4_t valid_lo = vceqq_f32(x_tab_lo, x_tab_lo);
uint32x4_t valid_hi = vceqq_f32(x_tab_hi, x_tab_hi);
// each element in valid is a mask which corresponds to isnan
uint16x8_t valid = vcombine_u16(vmovn_u32(valid_lo), vmovn_u32(valid_hi));
uint16x8_t v_0 = vandq_u16(vld1q_u16(depth_image_data_uint16 + offset), valid);
// v_z corresponds to z in naive code
int16x8_t v_z = vreinterpretq_s16_u16(v_0);
// expand v_z to compute x and y
float32x4_t v_z_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_0)));
float32x4_t v_z_hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_0)));
// load x_table and y_table
float32x4_t t_x_lo = vld1q_f32(x_tab + offset);
float32x4_t t_x_hi = vld1q_f32(x_tab + offset + 4);
float32x4_t t_y_lo = vld1q_f32(y_tab + offset);
float32x4_t t_y_hi = vld1q_f32(y_tab + offset + 4);
// main computation of x and y
int32x4_t v_x_lo = neon_floor(vmlaq_f32(half, v_z_lo, t_x_lo));
int32x4_t v_x_hi = neon_floor(vmlaq_f32(half, v_z_hi, t_x_hi));
int32x4_t v_y_lo = neon_floor(vmlaq_f32(half, v_z_lo, t_y_lo));
int32x4_t v_y_hi = neon_floor(vmlaq_f32(half, v_z_hi, t_y_hi));
int16x8_t v_x = vcombine_s16(vmovn_s32(v_x_lo), vmovn_s32(v_x_hi));
int16x8_t v_y = vcombine_s16(vmovn_s32(v_y_lo), vmovn_s32(v_y_hi));
// use scatter store instruction
int16x8x3_t store;
store.val[0] = v_x; // x0 x1 .. x14 x15
store.val[1] = v_y; // y0 y1 .. y14 y15
store.val[2] = v_z; // z0 z1 .. z14 z15
// x0 y0 z0 x1 y1 z1 .. x15 y15 z15
vst3q_s16(xyz_data_int16 + offset * 3, store);
}
}

#else /* defined(K4A_USING_SSE) */

static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_tables,
const void *depth_image_data,
Expand All @@ -1110,6 +1198,8 @@ static void transformation_depth_to_xyz(k4a_transformation_xy_tables_t *xy_table
__m128 *y_table_m128 = (__m128 *)y_table;
__m128i *xyz_data_m128i = (__m128i *)xyz_image_data;

set_special_instruction_optimization("SSE");

const int16_t pos0 = 0x0100;
const int16_t pos1 = 0x0302;
const int16_t pos2 = 0x0504;
Expand Down
24 changes: 24 additions & 0 deletions tests/Transformation/transformation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ class transformation_ut : public ::testing::Test
ASSERT_EQ_FLT(A[2], B[2]) \
}

// Export function from transformation.c to snoop on the compiler setting used.
extern "C" char *transformation_get_instruction_type();

static k4a_transformation_image_descriptor_t image_get_descriptor(const k4a_image_t image)
{
k4a_transformation_image_descriptor_t descriptor;
Expand Down Expand Up @@ -405,6 +408,27 @@ TEST_F(transformation_ut, transformation_depth_image_to_point_cloud)
ASSERT_EQ(check_sum, reference_val);
}

{
// Are we compiled for the correct instruction type
#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_IX86)
#define SPECIAL_INSTRUCTION_OPTIMIZATION "SSE\0"
#elif defined(__aarch64__) || defined(_M_ARM64)
#define SPECIAL_INSTRUCTION_OPTIMIZATION "NEON"
#else
// Omit defining this when not SSE or NEON. Should result in a build break. We are either SSE or Neon.
//#define SPECIAL_INSTRUCTION_OPTIMIZATION "None"
#endif
char *compile_type = transformation_get_instruction_type();
ASSERT_NE(compile_type, (char *)nullptr);
ASSERT_NE(compile_type[0], '\0');
std::cout << "*** K4A Sensor SDK Compile type is: " << compile_type << " ***\n";
ASSERT_TRUE(memcmp(compile_type, SPECIAL_INSTRUCTION_OPTIMIZATION, strlen(compile_type)) == 0)
<< "Expecting " << SPECIAL_INSTRUCTION_OPTIMIZATION << " but compiled for " << compile_type << "\n";
ASSERT_TRUE(memcmp(compile_type, SPECIAL_INSTRUCTION_OPTIMIZATION, strlen(compile_type)) == 0)
<< "Expecting " << SPECIAL_INSTRUCTION_OPTIMIZATION << " but compiled for " << compile_type << "\n";
ASSERT_EQ(strlen(compile_type), strlen(SPECIAL_INSTRUCTION_OPTIMIZATION));
}

image_dec_ref(depth_image);
image_dec_ref(xyz_image);
transformation_destroy(transformation_handle);
Expand Down
8 changes: 4 additions & 4 deletions tests/multidevice/multidevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -523,13 +523,13 @@ TEST_F(multidevice_sync_ft, multi_sync_validation)
if (g_frame_rate != K4A_FRAMES_PER_SECOND_5 && g_frame_rate != K4A_FRAMES_PER_SECOND_15 &&
g_frame_rate != K4A_FRAMES_PER_SECOND_30)
{
#if defined(__amd64__) || defined(_M_AMD64) || defined(__i386__) || defined(_M_X86)
printf("Using 5, 15, or 30FPS for AMD64/x86 build\n");
int frame_rate_rand = (int)RAND_VALUE(0, 2);
#else
#if defined(__aarch64__) || defined(_M_ARM64)
// Jetson Nano can't handle 2 30FPS streams
printf("Using 5 or 15FPS for ARM64 build\n");
int frame_rate_rand = (int)RAND_VALUE(0, 1);
#else
printf("Using 5, 15, or 30FPS for AMD64/x86 build\n");
int frame_rate_rand = (int)RAND_VALUE(0, 2);
#endif
switch (frame_rate_rand)
{
Expand Down