From 8e65684294674586b0af2f11bc34f47fae2df38a Mon Sep 17 00:00:00 2001 From: Jun He Date: Mon, 15 Jan 2018 14:44:15 +0800 Subject: [PATCH] WT-3851: Optimize wt_compare* routines with NEON instructions for ARM platform __wt_lex_compare* support vector instructions to accelerate data comparison. For ARM platforms with NEON/ASIMD features NEON implementation could be added to improve performance. Change-Id: Ifeb2c46308937708bdf6a80416a3191ff499ae36 Signed-off-by: Jun He --- build_posix/configure.ac.in | 2 +- src/include/btree_cmp.i | 70 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in index c33d013..bf532f6 100644 --- a/build_posix/configure.ac.in +++ b/build_posix/configure.ac.in @@ -154,7 +154,7 @@ AM_TYPES AC_PROG_INSTALL -AC_CHECK_HEADERS([x86intrin.h]) +AC_CHECK_HEADERS([x86intrin.h arm_neon.h]) AC_CHECK_LIB(pthread, pthread_create) AC_CHECK_LIB(dl, dlopen) AC_CHECK_LIB(rt, sched_yield) diff --git a/src/include/btree_cmp.i b/src/include/btree_cmp.i index 9efbf8f..2443452 100644 --- a/src/include/btree_cmp.i +++ b/src/include/btree_cmp.i @@ -6,10 +6,14 @@ * See the file LICENSE for redistribution information. */ -#ifdef HAVE_X86INTRIN_H -#if !defined(_MSC_VER) && !defined(_lint) +#if defined(HAVE_X86INTRIN_H) || defined(HAVE_ARM_NEON_H) +#if defined(HAVE_X86INTRIN_H) && !defined(_MSC_VER) && !defined(_lint) #include #endif + +#ifdef HAVE_ARM_NEON_H +#include +#endif /* 16B alignment */ #define WT_ALIGNED_16(p) (((uintptr_t)(p) & 0x0f) == 0) #define WT_VECTOR_SIZE 16 /* chunk size */ @@ -70,6 +74,36 @@ __wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item) } len += remain; } +#elif defined(HAVE_ARM_NEON_H) + /* Use vector instructions if we'll execute at least 2 of them. */ + if (len >= WT_VECTOR_SIZE * 2) { + size_t remain; + uint8x16_t res_eq, u, t; + + remain = len % WT_VECTOR_SIZE; + len -= remain; + if (WT_ALIGNED_16(userp) && WT_ALIGNED_16(treep)) + for (; len > 0; + len -= WT_VECTOR_SIZE, + userp += WT_VECTOR_SIZE, treep += WT_VECTOR_SIZE) { + u = vld1q_u8(userp); + t = vld1q_u8(treep); + res_eq = vceqq_u8(u, t); + if (vminvq_u8(res_eq) != 255) + break; + } + else + for (; len > 0; + len -= WT_VECTOR_SIZE, + userp += WT_VECTOR_SIZE, treep += WT_VECTOR_SIZE) { + u = vld1q_u8(userp); + t = vld1q_u8(treep); + res_eq = vceqq_u8(u, t); + if (vminvq_u8(res_eq) != 255) + break; + } + len += remain; + } #endif /* * Use the non-vectorized version for the remaining bytes and for the @@ -158,6 +192,38 @@ __wt_lex_compare_skip( } len += remain; } +#elif defined(HAVE_ARM_NEON_H) + /* Use vector instructions if we'll execute at least 2 of them. */ + if (len >= WT_VECTOR_SIZE * 2) { + size_t remain; + uint8x16_t res_eq, u, t; + + remain = len % WT_VECTOR_SIZE; + len -= remain; + if (WT_ALIGNED_16(userp) && WT_ALIGNED_16(treep)) + for (; len > 0; + len -= WT_VECTOR_SIZE, + userp += WT_VECTOR_SIZE, treep += WT_VECTOR_SIZE, + *matchp += WT_VECTOR_SIZE) { + u = vld1q_u8(userp); + t = vld1q_u8(treep); + res_eq = vceqq_u8(u, t); + if (vminvq_u8(res_eq) != 255) + break; + } + else + for (; len > 0; + len -= WT_VECTOR_SIZE, + userp += WT_VECTOR_SIZE, treep += WT_VECTOR_SIZE, + *matchp += WT_VECTOR_SIZE) { + u = vld1q_u8(userp); + t = vld1q_u8(treep); + res_eq = vceqq_u8(u, t); + if (vminvq_u8(res_eq) != 255) + break; + } + len += remain; + } #endif /* * Use the non-vectorized version for the remaining bytes and for the -- 2.7.4