From 3afc1d1777e1c0dc0fe832db2c9746fb8a767fe3 Mon Sep 17 00:00:00 2001
From: Eric
Date: Thu, 23 Jul 2020 02:16:46 +0200
Subject: [PATCH] libosmocoding: NEON viterbi acceleration

A configure flag is required to enable this: --enable-neon

Autodetection based on __ARM_NEON would work, because that macro is
only defined if the FPU is one of neon, neon-fp16, neon-vfpv3,
neon-vfpv4, neon-fp-armv8 or crypto-neon-fp-armv8. Doing so would
however lead to an unknown performance impact, so NEON support needs
to be enabled manually.

The speedup is about 1.3-1.5x on an unspecified single-core Cortex-A9.

This requires handling a special case: RACH, with len 14, is far too
short for NEON and would actually incur a performance penalty of about
25%, so such short blocks keep using the generic code.

Related: OS#4585
Change-Id: I58ff2cb4ce3514f43390ff0a2121f81e6a4983b5
---
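Note for reviewers (this section is ignored by git-am and is not part of
the commit): the len < 100 cutoff in vdec_init() below is what keeps
RACH on the generic path. A minimal scalar sketch of the dispatch idea,
using a hypothetical SHORT_BLOCK_CUTOFF name for the threshold (the
patch itself hardcodes 100):

        /* Sketch only, not patch code: short trellises do not amortize
         * the per-call NEON setup cost, so they stay on the generic
         * metric functions. */
        #define SHORT_BLOCK_CUTOFF 100

        typedef void (*metric_func_t)(const int8_t *, const int16_t *,
                                      int16_t *, int16_t *, int);

        static metric_func_t pick_metrics(int len, metric_func_t generic,
                                          metric_func_t neon)
        {
                /* RACH has len 14; NEON was measured ~25% slower there */
                if (len < SHORT_BLOCK_CUTOFF)
                        return generic;
                return neon;
        }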
 configure.ac             |  11 ++
 src/Makefile.am          |   5 +
 src/conv_acc.c           |  28 ++++
 src/conv_acc_neon.c      | 110 ++++++++++++
 src/conv_acc_neon_impl.h | 354 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 508 insertions(+)
 create mode 100644 src/conv_acc_neon.c
 create mode 100644 src/conv_acc_neon_impl.h

diff --git a/configure.ac b/configure.ac
index f69c78de3..2397b2f30 100644
--- a/configure.ac
+++ b/configure.ac
@@ -378,6 +378,17 @@ else
 	AM_CONDITIONAL(HAVE_SSE4_1, false)
 fi
 
+AC_ARG_ENABLE(neon,
+	[AS_HELP_STRING(
+		[--enable-neon],
+		[Enable NEON support]
+	)],
+	[neon=$enableval], [neon="no"])
+AS_IF([test "x$neon" != "xno"],
+	[AC_DEFINE(HAVE_NEON, 1, [Support ARM NEON instructions])])
+AM_CONDITIONAL(HAVE_NEON, [test "x$neon" != "xno"])
+
+
 OSMO_AC_CODE_COVERAGE
 
 dnl Check if the compiler supports specified GCC's built-in function
diff --git a/src/Makefile.am b/src/Makefile.am
index 16119d981..be097848f 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -48,6 +48,11 @@ endif
 endif
 endif
 
+if HAVE_NEON
+libosmocore_la_SOURCES += conv_acc_neon.c
+# no "conv_acc_neon.lo: AM_CFLAGS += -mfpu=neon" here: the fpu could as well be vfp with neon
+endif
+
 BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
 EXTRA_DIST = conv_acc_sse_impl.h crcXXgen.c.tpl
diff --git a/src/conv_acc.c b/src/conv_acc.c
index c16e43643..0f6f7ca25 100644
--- a/src/conv_acc.c
+++ b/src/conv_acc.c
@@ -85,6 +85,11 @@ int16_t *osmo_conv_sse_avx_vdec_malloc(size_t n);
 void osmo_conv_sse_avx_vdec_free(int16_t *ptr);
 #endif
 
+#ifdef HAVE_NEON
+int16_t *osmo_conv_neon_vdec_malloc(size_t n);
+void osmo_conv_neon_vdec_free(int16_t *ptr);
+#endif
+
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
 		int16_t *sums, int16_t *paths, int norm);
@@ -129,6 +134,21 @@ void osmo_conv_sse_avx_metrics_k7_n4(const int8_t *seq, const int16_t *out,
 		int16_t *sums, int16_t *paths, int norm);
 #endif
 
+#if defined(HAVE_NEON)
+void osmo_conv_neon_metrics_k5_n2(const int8_t *seq, const int16_t *out,
+		int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k5_n3(const int8_t *seq, const int16_t *out,
+		int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k5_n4(const int8_t *seq, const int16_t *out,
+		int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k7_n2(const int8_t *seq, const int16_t *out,
+		int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k7_n3(const int8_t *seq, const int16_t *out,
+		int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k7_n4(const int8_t *seq, const int16_t *out,
+		int16_t *sums, int16_t *paths, int norm);
+#endif
+
 /* Trellis State
  * state - Internal lshift register value
  * prev - Register values of previous 0 and 1 states
@@ -528,6 +548,12 @@ static int vdec_init(struct vdecoder *dec, const struct osmo_conv_code *code)
 	if (dec->k == 5) {
 		switch (dec->n) {
 		case 2:
+/* RACH with len 14 is far too short for NEON; stay on the generic code */
+#ifdef HAVE_NEON
+			if (code->len < 100)
+				dec->metric_func = osmo_conv_gen_metrics_k5_n2;
+			else
+#endif
 			dec->metric_func = osmo_conv_metrics_k5_n2;
 			break;
 		case 3:
@@ -681,6 +707,8 @@ static void osmo_conv_init(void)
 	} else {
 		INIT_POINTERS(gen);
 	}
+#elif defined(HAVE_NEON)
+	INIT_POINTERS(neon);
 #else
 	INIT_POINTERS(gen);
 #endif
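[Note, not part of the commit: INIT_POINTERS() is an existing macro in
conv_acc.c, outside the context shown above. As an abridged sketch of
the pattern (not the verbatim macro), it token-pastes the backend
prefix into the global function pointers, which is why the new backend
only has to follow the osmo_conv_neon_* naming scheme:

        /* Abridged sketch of the existing dispatch pattern: */
        #define INIT_POINTERS(simd) \
        { \
                osmo_conv_metrics_k5_n2 = osmo_conv_##simd##_metrics_k5_n2; \
                /* ... likewise for k5_n3/k5_n4 and k7_n2/n3/n4 ... */ \
                vdec_malloc = &osmo_conv_##simd##_vdec_malloc; \
                vdec_free = &osmo_conv_##simd##_vdec_free; \
        }
]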
diff --git a/src/conv_acc_neon.c b/src/conv_acc_neon.c
new file mode 100644
index 000000000..72449468e
--- /dev/null
+++ b/src/conv_acc_neon.c
@@ -0,0 +1,110 @@
+/*! \file conv_acc_neon.c
+ * Accelerated Viterbi decoder implementation
+ * for architectures with only NEON available. */
+/*
+ * (C) 2020 by sysmocom - s.f.m.c. GmbH
+ * Author: Eric Wild
+ *
+ * All Rights Reserved
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <malloc.h>
+#include <stdint.h>
+#include <string.h>
+#include "config.h"
+
+#if defined(HAVE_NEON)
+#include <arm_neon.h>
+#endif
+
+/* alignment requirement is 16 on Android (bionic quirk), 8 on sane platforms */
+#define NEON_ALIGN 8
+
+#include "conv_acc_neon_impl.h"
+
+/* Aligned Memory Allocator
+ * NEON requires 8-byte memory alignment. We store relevant trellis values
+ * (accumulated sums, outputs, and path decisions) as 16-bit signed integers,
+ * so the allocated memory is cast accordingly.
+ */
+__attribute__ ((visibility("hidden")))
+int16_t *osmo_conv_neon_vdec_malloc(size_t n)
+{
+	return (int16_t *) memalign(NEON_ALIGN, sizeof(int16_t) * n);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_neon_vdec_free(int16_t *ptr)
+{
+	free(ptr);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_neon_metrics_k5_n2(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_neon_metrics_k5_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_neon_metrics_k5_n3(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_neon_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_neon_metrics_k5_n4(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_neon_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_neon_metrics_k7_n2(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_neon_metrics_k7_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_neon_metrics_k7_n3(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_neon_metrics_k7_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_neon_metrics_k7_n4(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_neon_metrics_k7_n4(_val, out, sums, paths, norm);
+}
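[Note, not part of the commit: the k5_n3 and k7_n3 wrappers above pad
the fourth input symbol with 0 and reuse the rate-1/4 kernels. That is
correct because the branch metric is a dot product, so a zero symbol
contributes nothing. A scalar sketch of what the N4 kernels compute per
branch (ignoring the saturating arithmetic the NEON code uses):

        static inline int16_t branch_metric_n4(const int16_t val[4],
                                               const int16_t out[4])
        {
                /* with val[3] == 0 the fourth product vanishes, so the
                 * rate-1/4 kernel yields the rate-1/3 metric unchanged */
                return val[0] * out[0] + val[1] * out[1] +
                       val[2] * out[2] + val[3] * out[3];
        }
]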
diff --git a/src/conv_acc_neon_impl.h b/src/conv_acc_neon_impl.h
new file mode 100644
index 000000000..4471127e8
--- /dev/null
+++ b/src/conv_acc_neon_impl.h
@@ -0,0 +1,354 @@
+/*! \file conv_acc_neon_impl.h
+ * Accelerated Viterbi decoder implementation:
+ * straight port of the SSE version to NEON, based on Tom Tsou's work */
+/*
+ * (C) 2020 by sysmocom - s.f.m.c. GmbH
+ * Author: Eric Wild
+ *
+ * All Rights Reserved
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/* Some distributions (notably Alpine Linux) for some strange reason
+ * don't have this #define */
+#ifndef __always_inline
+#define __always_inline inline __attribute__((always_inline))
+#endif
+
+#define NEON_BUTTERFLY(M0,M1,M2,M3,M4) \
+{ \
+	M3 = vqaddq_s16(M0, M2); \
+	M4 = vqsubq_s16(M1, M2); \
+	M0 = vqsubq_s16(M0, M2); \
+	M1 = vqaddq_s16(M1, M2); \
+	M2 = vmaxq_s16(M3, M4); \
+	M3 = vreinterpretq_s16_u16(vcgtq_s16(M3, M4)); \
+	M4 = vmaxq_s16(M0, M1); \
+	M1 = vreinterpretq_s16_u16(vcgtq_s16(M0, M1)); \
+}
+
+#define NEON_DEINTERLEAVE_K5(M0,M1,M2,M3) \
+{ \
+	int16x8x2_t tmp; \
+	tmp = vuzpq_s16(M0, M1); \
+	M2 = tmp.val[0]; \
+	M3 = tmp.val[1]; \
+}
+
+#define NEON_DEINTERLEAVE_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11,M12,M13,M14,M15) \
+{ \
+	int16x8x2_t tmp; \
+	tmp = vuzpq_s16(M0, M1); \
+	M8 = tmp.val[0]; M9 = tmp.val[1]; \
+	tmp = vuzpq_s16(M2, M3); \
+	M10 = tmp.val[0]; M11 = tmp.val[1]; \
+	tmp = vuzpq_s16(M4, M5); \
+	M12 = tmp.val[0]; M13 = tmp.val[1]; \
+	tmp = vuzpq_s16(M6, M7); \
+	M14 = tmp.val[0]; M15 = tmp.val[1]; \
+}
+
+#define NEON_BRANCH_METRIC_N2(M0,M1,M2,M3,M4,M6,M7) \
+{ \
+	M0 = vmulq_s16(M4, M0); \
+	M1 = vmulq_s16(M4, M1); \
+	M2 = vmulq_s16(M4, M2); \
+	M3 = vmulq_s16(M4, M3); \
+	M6 = vcombine_s16(vpadd_s16(vget_low_s16(M0), vget_high_s16(M0)), vpadd_s16(vget_low_s16(M1), vget_high_s16(M1))); \
+	M7 = vcombine_s16(vpadd_s16(vget_low_s16(M2), vget_high_s16(M2)), vpadd_s16(vget_low_s16(M3), vget_high_s16(M3))); \
+}
+
+#define NEON_BRANCH_METRIC_N4(M0,M1,M2,M3,M4,M5) \
+{ \
+	M0 = vmulq_s16(M4, M0); \
+	M1 = vmulq_s16(M4, M1); \
+	M2 = vmulq_s16(M4, M2); \
+	M3 = vmulq_s16(M4, M3); \
+	int16x4_t t1 = vpadd_s16(vpadd_s16(vget_low_s16(M0), vget_high_s16(M0)), vpadd_s16(vget_low_s16(M1), vget_high_s16(M1))); \
+	int16x4_t t2 = vpadd_s16(vpadd_s16(vget_low_s16(M2), vget_high_s16(M2)), vpadd_s16(vget_low_s16(M3), vget_high_s16(M3))); \
+	M5 = vcombine_s16(t1, t2); \
+}
+
+#define NEON_NORMALIZE_K5(M0,M1,M2,M3) \
+{ \
+	M2 = vminq_s16(M0, M1); \
+	int16x4_t t = vpmin_s16(vget_low_s16(M2), vget_high_s16(M2)); \
+	t = vpmin_s16(t, t); \
+	t = vpmin_s16(t, t); \
+	M2 = vdupq_lane_s16(t, 0); \
+	M0 = vqsubq_s16(M0, M2); \
+	M1 = vqsubq_s16(M1, M2); \
+}
+
+#define NEON_NORMALIZE_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11) \
+{ \
+	M8 = vminq_s16(M0, M1); \
+	M9 = vminq_s16(M2, M3); \
+	M10 = vminq_s16(M4, M5); \
+	M11 = vminq_s16(M6, M7); \
+	M8 = vminq_s16(M8, M9); \
+	M10 = vminq_s16(M10, M11); \
+	M8 = vminq_s16(M8, M10); \
+	int16x4_t t = vpmin_s16(vget_low_s16(M8), vget_high_s16(M8)); \
+	t = vpmin_s16(t, t); \
+	t = vpmin_s16(t, t); \
+	M8 = vdupq_lane_s16(t, 0); \
+	M0 = vqsubq_s16(M0, M8); \
+	M1 = vqsubq_s16(M1, M8); \
+	M2 = vqsubq_s16(M2, M8); \
+	M3 = vqsubq_s16(M3, M8); \
+	M4 = vqsubq_s16(M4, M8); \
+	M5 = vqsubq_s16(M5, M8); \
+	M6 = vqsubq_s16(M6, M8); \
+	M7 = vqsubq_s16(M7, M8); \
+}
+
+__always_inline static void _neon_metrics_k5_n2(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
+						int norm)
+{
+	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
+	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
+	int16x8_t m0, m1, m2, m3, m4, m5, m6;
+	int16x4_t input;
+
+	/* (BMU) Load input symbols (already widened to 16 bits by the wrapper) */
+	input = vld1_s16(val);
+	m2 = vcombine_s16(input, input);
+
+	/* (BMU) Load and compute branch metrics */
+	m0 = vld1q_s16(&out[0]);
+	m1 = vld1q_s16(&out[8]);
+
+	m0 = vmulq_s16(m2, m0);
+	m1 = vmulq_s16(m2, m1);
+	m2 = vcombine_s16(vpadd_s16(vget_low_s16(m0), vget_high_s16(m0)),
+			  vpadd_s16(vget_low_s16(m1), vget_high_s16(m1)));
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = vld1q_s16(&sums[0]);
+	m1 = vld1q_s16(&sums[8]);
+
+	NEON_DEINTERLEAVE_K5(m0, m1, m3, m4)
+
+	/* (PMU) Butterflies: 0-7 */
+	NEON_BUTTERFLY(m3, m4, m2, m5, m6)
+
+	if (norm)
+		NEON_NORMALIZE_K5(m2, m6, m0, m1)
+
+	vst1q_s16(&sums[0], m2);
+	vst1q_s16(&sums[8], m6);
+	vst1q_s16(&paths[0], m5);
+	vst1q_s16(&paths[8], m4);
+}
+
+__always_inline static void _neon_metrics_k5_n4(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
+						int norm)
+{
+	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
+	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
+	int16x8_t m0, m1, m2, m3, m4, m5, m6;
+	int16x4_t input;
+
+	/* (BMU) Load input symbols (already widened to 16 bits by the wrapper) */
+	input = vld1_s16(val);
+	m4 = vcombine_s16(input, input);
+
+	/* (BMU) Load and compute branch metrics */
+	m0 = vld1q_s16(&out[0]);
+	m1 = vld1q_s16(&out[8]);
+	m2 = vld1q_s16(&out[16]);
+	m3 = vld1q_s16(&out[24]);
+
+	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = vld1q_s16(&sums[0]);
+	m1 = vld1q_s16(&sums[8]);
+
+	NEON_DEINTERLEAVE_K5(m0, m1, m3, m4)
+
+	/* (PMU) Butterflies: 0-7 */
+	NEON_BUTTERFLY(m3, m4, m2, m5, m6)
+
+	if (norm)
+		NEON_NORMALIZE_K5(m2, m6, m0, m1)
+
+	vst1q_s16(&sums[0], m2);
+	vst1q_s16(&sums[8], m6);
+	vst1q_s16(&paths[0], m5);
+	vst1q_s16(&paths[8], m4);
+}
+
+__always_inline static void _neon_metrics_k7_n2(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
+						int norm)
+{
+	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
+	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
+	int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
+	int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
+	int16x4_t input;
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = vld1q_s16(&sums[0]);
+	m1 = vld1q_s16(&sums[8]);
+	m2 = vld1q_s16(&sums[16]);
+	m3 = vld1q_s16(&sums[24]);
+	m4 = vld1q_s16(&sums[32]);
+	m5 = vld1q_s16(&sums[40]);
+	m6 = vld1q_s16(&sums[48]);
+	m7 = vld1q_s16(&sums[56]);
+
+	/* (PMU) Deinterleave into even and odd packed registers */
+	NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)
+
+	/* (BMU) Load input symbols (already widened to 16 bits by the wrapper) */
+	input = vld1_s16(val);
+	m7 = vcombine_s16(input, input);
+
+	/* (BMU) Load and compute branch metrics */
+	m0 = vld1q_s16(&out[0]);
+	m1 = vld1q_s16(&out[8]);
+	m2 = vld1q_s16(&out[16]);
+	m3 = vld1q_s16(&out[24]);
+
+	NEON_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)
+
+	m0 = vld1q_s16(&out[32]);
+	m1 = vld1q_s16(&out[40]);
+	m2 = vld1q_s16(&out[48]);
+	m3 = vld1q_s16(&out[56]);
+
+	NEON_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)
+
+	/* (PMU) Butterflies: 0-15 */
+	NEON_BUTTERFLY(m8, m9, m4, m0, m1)
+	NEON_BUTTERFLY(m10, m11, m5, m2, m3)
+
+	vst1q_s16(&paths[0], m0);
+	vst1q_s16(&paths[8], m2);
+	vst1q_s16(&paths[32], m9);
+	vst1q_s16(&paths[40], m11);
+
+	/* (PMU) Butterflies: 16-31 */
+	NEON_BUTTERFLY(m12, m13, m6, m0, m2)
+	NEON_BUTTERFLY(m14, m15, m7, m9, m11)
+
+	vst1q_s16(&paths[16], m0);
+	vst1q_s16(&paths[24], m9);
+	vst1q_s16(&paths[48], m13);
+	vst1q_s16(&paths[56], m15);
+
+	if (norm)
+		NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)
+
+	vst1q_s16(&sums[0], m4);
+	vst1q_s16(&sums[8], m5);
+	vst1q_s16(&sums[16], m6);
+	vst1q_s16(&sums[24], m7);
+	vst1q_s16(&sums[32], m1);
+	vst1q_s16(&sums[40], m3);
+	vst1q_s16(&sums[48], m2);
+	vst1q_s16(&sums[56], m11);
+}
+
+__always_inline static void _neon_metrics_k7_n4(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
+						int norm)
+{
+	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
+	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
+	int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
+	int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
+	int16x4_t input;
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = vld1q_s16(&sums[0]);
+	m1 = vld1q_s16(&sums[8]);
+	m2 = vld1q_s16(&sums[16]);
+	m3 = vld1q_s16(&sums[24]);
+	m4 = vld1q_s16(&sums[32]);
+	m5 = vld1q_s16(&sums[40]);
+	m6 = vld1q_s16(&sums[48]);
+	m7 = vld1q_s16(&sums[56]);
+
+	/* (PMU) Deinterleave into even and odd packed registers */
+	NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)
+
+	/* (BMU) Load input symbols (already widened to 16 bits by the wrapper) */
+	input = vld1_s16(val);
+	m7 = vcombine_s16(input, input);
+
+	/* (BMU) Load and compute branch metrics */
+	m0 = vld1q_s16(&out[0]);
+	m1 = vld1q_s16(&out[8]);
+	m2 = vld1q_s16(&out[16]);
+	m3 = vld1q_s16(&out[24]);
+
+	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)
+
+	m0 = vld1q_s16(&out[32]);
+	m1 = vld1q_s16(&out[40]);
+	m2 = vld1q_s16(&out[48]);
+	m3 = vld1q_s16(&out[56]);
+
+	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)
+
+	m0 = vld1q_s16(&out[64]);
+	m1 = vld1q_s16(&out[72]);
+	m2 = vld1q_s16(&out[80]);
+	m3 = vld1q_s16(&out[88]);
+
+	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)
+
+	m0 = vld1q_s16(&out[96]);
+	m1 = vld1q_s16(&out[104]);
+	m2 = vld1q_s16(&out[112]);
+	m3 = vld1q_s16(&out[120]);
+
+	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)
+
+	/* (PMU) Butterflies: 0-15 */
+	NEON_BUTTERFLY(m8, m9, m4, m0, m1)
+	NEON_BUTTERFLY(m10, m11, m5, m2, m3)
+
+	vst1q_s16(&paths[0], m0);
+	vst1q_s16(&paths[8], m2);
+	vst1q_s16(&paths[32], m9);
+	vst1q_s16(&paths[40], m11);
+
+	/* (PMU) Butterflies: 16-31 */
+	NEON_BUTTERFLY(m12, m13, m6, m0, m2)
+	NEON_BUTTERFLY(m14, m15, m7, m9, m11)
+
+	vst1q_s16(&paths[16], m0);
+	vst1q_s16(&paths[24], m9);
+	vst1q_s16(&paths[48], m13);
+	vst1q_s16(&paths[56], m15);
+
+	if (norm)
+		NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)
+
+	vst1q_s16(&sums[0], m4);
+	vst1q_s16(&sums[8], m5);
+	vst1q_s16(&sums[16], m6);
+	vst1q_s16(&sums[24], m7);
+	vst1q_s16(&sums[32], m1);
+	vst1q_s16(&sums[40], m3);
+	vst1q_s16(&sums[48], m2);
+	vst1q_s16(&sums[56], m11);
+}
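
[Testing note, not part of the commit: after
`autoreconf -fi && ./configure --enable-neon && make`, the new path can
be smoke-tested through the public API. A minimal sketch, assuming the
predefined xCCH code from libosmogsm is available (len 224 > 100, so the
NEON K=5/N=2 metrics are selected; 224 info bits plus 4 tail bits encode
to 456 bits at rate 1/2). Build with
`gcc test_conv.c $(pkg-config --cflags --libs libosmogsm libosmocore)`:

        /* Smoke-test sketch, not part of the patch. Encodes and then
         * decodes a noiseless block and checks for a round trip. */
        #include <stdio.h>
        #include <string.h>
        #include <osmocom/core/bits.h>
        #include <osmocom/core/conv.h>
        #include <osmocom/gsm/gsm0503.h>

        int main(void)
        {
                ubit_t in[224], enc[456], dec[224];
                sbit_t soft[456];
                int i, rc;

                for (i = 0; i < 224; i++)       /* arbitrary test pattern */
                        in[i] = (i >> 2) & 1;

                osmo_conv_encode(&osmo_conv_gsm0503_xcch, in, enc);
                osmo_ubit2sbit(soft, enc, 456); /* hard -> soft bits, no noise */
                rc = osmo_conv_decode(&osmo_conv_gsm0503_xcch, soft, dec);

                printf("decode rc=%d, %s\n", rc,
                       memcmp(in, dec, sizeof(in)) ? "MISMATCH" : "ok");
                return memcmp(in, dec, sizeof(in)) ? 1 : 0;
        }
]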