/******************************************************************************
 * @file     arm_vec_math.h
 * @brief    Public header file for CMSIS DSP Library
 * @version  V1.7.0
 * @date     15. October 2019
 ******************************************************************************/
/*
 * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef _ARM_VEC_MATH_H
#define _ARM_VEC_MATH_H

#include "arm_math.h"
#include "arm_common_tables.h"
#include "arm_helium_utils.h"

#ifdef   __cplusplus
extern "C"
{
#endif

#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)

#define INV_NEWTON_INIT_F32         0x7EF127EA

static const float32_t __logf_rng_f32 = 0.693147180f;

/* fast inverse approximation (3x newton) */
__STATIC_INLINE f32x4_t vrecip_medprec_f32(
    f32x4_t x)
{
    q31x4_t         m;
    f32x4_t         b;
    any32x4_t       xinv;
    f32x4_t         ax = vabsq(x);

    xinv.f = ax;
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    xinv.i = xinv.i + m;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));

    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}

/* fast inverse approximation (4x newton) */
__STATIC_INLINE f32x4_t vrecip_hiprec_f32(
    f32x4_t x)
{
    q31x4_t         m;
    f32x4_t         b;
    any32x4_t       xinv;
    f32x4_t         ax = vabsq(x);

    xinv.f = ax;
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    xinv.i = xinv.i + m;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));

    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}

__STATIC_INLINE f32x4_t vdiv_f32(
    f32x4_t num, f32x4_t den)
{
    return vmulq(num, vrecip_hiprec_f32(den));
}
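/*
 * Illustration only, not part of the CMSIS-DSP API: a scalar sketch of the
 * scheme used by vrecip_medprec_f32 / vrecip_hiprec_f32 above. The exponent
 * bits of |x| are rewritten to scale the value into [1, 2), the linear seed
 * 1.41176471f - 0.47058824f * x (= 24/17 - (8/17)x) approximates the
 * reciprocal of that mantissa, the exponent offset is folded back in, and
 * Newton-Raphson steps y = y * (2 - |x| * y) refine the estimate. Three
 * steps correspond to the "medprec" variant, four to "hiprec". The name
 * recip_scalar_sketch is hypothetical.
 */
#if 0
#include <math.h>
#include <stdint.h>
#include <string.h>

static float recip_scalar_sketch(float x, int newton_steps)
{
    float   ax = fabsf(x);
    float   y;
    int32_t i, m;

    memcpy(&i, &ax, sizeof(i));                /* reinterpret float bits     */
    m = 0x3F800000 - (i & 0x7F800000);         /* offset that maps to [1,2)  */
    i += m;
    memcpy(&y, &i, sizeof(y));
    y = 1.41176471f - 0.47058824f * y;         /* linear seed for 1/mantissa */
    memcpy(&i, &y, sizeof(i));
    i += m;                                    /* fold the exponent back in  */
    memcpy(&y, &i, sizeof(y));

    while (newton_steps-- > 0)
        y = y * (2.0f - ax * y);               /* Newton-Raphson refinement  */

    if (x == 0.0f)
        return INFINITY;                       /* matches the vdupq_m above  */
    return (x < 0.0f) ? -y : y;                /* restore sign               */
}
#endif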
/**
  @brief         Single-precision Taylor series evaluation
  @param[in]     x              f32 quad vector input
  @param[in]     coeffs         f32 quad vector coeffs
  @return        destination    f32 quad vector
 */
__STATIC_INLINE f32x4_t vtaylor_polyq_f32(
        f32x4_t           x,
        const float32_t * coeffs)
{
    f32x4_t         A = vfmasq(vdupq_n_f32(coeffs[4]), x, coeffs[0]);
    f32x4_t         B = vfmasq(vdupq_n_f32(coeffs[6]), x, coeffs[2]);
    f32x4_t         C = vfmasq(vdupq_n_f32(coeffs[5]), x, coeffs[1]);
    f32x4_t         D = vfmasq(vdupq_n_f32(coeffs[7]), x, coeffs[3]);
    f32x4_t         x2 = vmulq(x, x);
    f32x4_t         x4 = vmulq(x2, x2);
    f32x4_t         res = vfmaq(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4);

    return res;
}

__STATIC_INLINE f32x4_t vmant_exp_f32(
    f32x4_t     x,
    int32x4_t * e)
{
    any32x4_t       r;
    int32x4_t       n;

    r.f = x;
    n = r.i >> 23;
    n = n - 127;
    r.i = r.i - (n << 23);

    *e = n;
    return r.f;
}

__STATIC_INLINE f32x4_t vlogq_f32(f32x4_t vecIn)
{
    q31x4_t         vecExpUnBiased;
    f32x4_t         vecTmpFlt0, vecTmpFlt1;
    f32x4_t         vecAcc0, vecAcc1, vecAcc2, vecAcc3;
    f32x4_t         vecExpUnBiasedFlt;

    /*
     * extract exponent
     */
    vecTmpFlt1 = vmant_exp_f32(vecIn, &vecExpUnBiased);

    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
    /*
     * a = (__logf_lut_f32[4] * r.f) + (__logf_lut_f32[0]);
     */
    vecAcc0 = vdupq_n_f32(__logf_lut_f32[0]);
    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f32[4]);
    /*
     * b = (__logf_lut_f32[6] * r.f) + (__logf_lut_f32[2]);
     */
    vecAcc1 = vdupq_n_f32(__logf_lut_f32[2]);
    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f32[6]);
    /*
     * c = (__logf_lut_f32[5] * r.f) + (__logf_lut_f32[1]);
     */
    vecAcc2 = vdupq_n_f32(__logf_lut_f32[1]);
    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f32[5]);
    /*
     * d = (__logf_lut_f32[7] * r.f) + (__logf_lut_f32[3]);
     */
    vecAcc3 = vdupq_n_f32(__logf_lut_f32[3]);
    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f32[7]);
    /*
     * a = a + b * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
    /*
     * c = c + d * xx;
     */
    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
    /*
     * xx = xx * xx;
     */
    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
    vecExpUnBiasedFlt = vcvtq_f32_s32(vecExpUnBiased);
    /*
     * r.f = a + c * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
    /*
     * add exponent
     * r.f = r.f + ((float32_t) m) * __logf_rng_f32;
     */
    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f32);

    // set log(0) to -inf
    vecAcc0 = vdupq_m(vecAcc0, -INFINITY, vcmpeqq(vecIn, 0.0f));

    return vecAcc0;
}

__STATIC_INLINE f32x4_t vexpq_f32(
    f32x4_t x)
{
    // Perform range reduction [-log(2),log(2)]
    int32x4_t       m = vcvtq_s32_f32(vmulq_n_f32(x, 1.4426950408f));
    f32x4_t         val = vfmsq_f32(x, vcvtq_f32_s32(m), vdupq_n_f32(0.6931471805f));

    // Polynomial Approximation
    f32x4_t         poly = vtaylor_polyq_f32(val, exp_tab);

    // Reconstruct
    poly = (f32x4_t) (vqaddq_s32((q31x4_t) (poly), vqshlq_n_s32(m, 23)));
    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s32(m, -126));

    return poly;
}

__STATIC_INLINE f32x4_t arm_vec_exponent_f32(f32x4_t x, int32_t nb)
{
    f32x4_t         r = x;
    nb--;
    while (nb > 0) {
        r = vmulq(r, x);
        nb--;
    }
    return (r);
}
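/*
 * Illustration only, not part of the CMSIS-DSP API: a scalar sketch of the
 * scheme used by vexpq_f32 above. The input is split as x = m*ln(2) + r,
 * e^r is approximated by a polynomial on the reduced range, and the result
 * 2^m * e^r is rebuilt by adding m to the exponent bits. Plain Taylor
 * coefficients stand in for the exp_tab table here, and the name
 * exp_scalar_sketch is hypothetical.
 */
#if 0
#include <math.h>
#include <stdint.h>
#include <string.h>

static float exp_scalar_sketch(float x)
{
    /* range reduction: m ~= x / ln(2), truncated like vcvtq_s32_f32 */
    int32_t m = (int32_t)(x * 1.4426950408f);
    float   r = x - (float)m * 0.6931471805f;

    /* degree-7 Taylor polynomial for e^r; exp_tab holds tuned
       coefficients evaluated in a different (Estrin-like) order */
    float p = 1.0f + r * (1.0f + r * (1.0f / 2 + r * (1.0f / 6 +
              r * (1.0f / 24 + r * (1.0f / 120 + r * (1.0f / 720 +
              r * (1.0f / 5040)))))));

    /* reconstruct 2^m * p by adding m to the exponent field */
    int32_t i;
    memcpy(&i, &p, sizeof(i));
    i += m << 23;
    memcpy(&p, &i, sizeof(p));

    return (m < -126) ? 0.0f : p;              /* flush underflow, as above */
}
#endif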
__STATIC_INLINE f32x4_t vrecip_f32(f32x4_t vecIn)
{
    f32x4_t     vecSx, vecW, vecTmp;
    any32x4_t   v;

    vecSx = vabsq(vecIn);

    v.f = vecIn;
    v.i = vsubq(vdupq_n_s32(INV_NEWTON_INIT_F32), v.i);

    vecW = vmulq(vecSx, v.f);

    // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w * (56 + w * (-28 + w * (8 - w)))))));
    vecTmp = vsubq(vdupq_n_f32(8.0f), vecW);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -70.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 8.0f);
    v.f = vmulq(v.f, vecTmp);

    v.f = vdupq_m(v.f, INFINITY, vcmpeqq(vecIn, 0.0f));

    /*
     * restore sign
     */
    v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));

    return v.f;
}

__STATIC_INLINE f32x4_t vtanhq_f32(
    f32x4_t val)
{
    f32x4_t         x =
        vminnmq_f32(vmaxnmq_f32(val, vdupq_n_f32(-10.f)), vdupq_n_f32(10.0f));
    f32x4_t         exp2x = vexpq_f32(vmulq_n_f32(x, 2.f));
    f32x4_t         num = vsubq_n_f32(exp2x, 1.f);
    f32x4_t         den = vaddq_n_f32(exp2x, 1.f);
    f32x4_t         tanh = vmulq_f32(num, vrecip_f32(den));

    return tanh;
}

__STATIC_INLINE f32x4_t vpowq_f32(
    f32x4_t val,
    f32x4_t n)
{
    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
}

#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */

#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */

#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "NEMath.h"

/**
 * @brief Vectorized integer exponentiation
 * @param[in]    x           value
 * @param[in]    nb          integer exponent >= 1
 * @return x^nb
 *
 */
__STATIC_INLINE float32x4_t arm_vec_exponent_f32(float32x4_t x, int32_t nb)
{
    float32x4_t r = x;
    nb--;
    while (nb > 0)
    {
        r = vmulq_f32(r, x);
        nb--;
    }
    return (r);
}

__STATIC_INLINE float32x4_t __arm_vec_sqrt_f32_neon(float32x4_t x)
{
    float32x4_t x1 = vmaxq_f32(x, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return vmulq_f32(x, e);
}

__STATIC_INLINE int16x8_t __arm_vec_sqrt_q15_neon(int16x8_t vec)
{
    float32x4_t tempF;
    int32x4_t tempHI, tempLO;

    tempLO = vmovl_s16(vget_low_s16(vec));

    tempF = vcvtq_n_f32_s32(tempLO, 15);
    tempF = __arm_vec_sqrt_f32_neon(tempF);
    tempLO = vcvtq_n_s32_f32(tempF, 15);

    tempHI = vmovl_s16(vget_high_s16(vec));

    tempF = vcvtq_n_f32_s32(tempHI, 15);
    tempF = __arm_vec_sqrt_f32_neon(tempF);
    tempHI = vcvtq_n_s32_f32(tempF, 15);

    return (vcombine_s16(vqmovn_s32(tempLO), vqmovn_s32(tempHI)));
}

__STATIC_INLINE int32x4_t __arm_vec_sqrt_q31_neon(int32x4_t vec)
{
    float32x4_t temp;

    temp = vcvtq_n_f32_s32(vec, 31);
    temp = __arm_vec_sqrt_f32_neon(temp);
    return (vcvtq_n_s32_f32(temp, 31));
}

#endif /* (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */
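/*
 * Illustration only, not part of the CMSIS-DSP API: the fixed-point square
 * roots above round-trip through float, using vcvtq_n_* with a scale of
 * 2^-15 (Q15) or 2^-31 (Q31). A scalar sketch of the Q15 path for one lane
 * (the name sqrt_q15_scalar_sketch is hypothetical):
 */
#if 0
#include <math.h>
#include <stdint.h>

static int16_t sqrt_q15_scalar_sketch(int16_t x_q15)
{
    if (x_q15 <= 0)
        return 0;                           /* no real root for x < 0      */

    float xf = (float)x_q15 / 32768.0f;     /* Q15 -> float, scale 2^-15   */
    float sf = sqrtf(xf);                   /* sqrt of a value in (0, 1)   */
    float r  = sf * 32768.0f;               /* float -> Q15                */

    if (r > 32767.0f)
        r = 32767.0f;                       /* saturate, like vqmovn_s32   */
    return (int16_t)r;
}

/* e.g. 0x2000 (0.25 in Q15) maps to 0x4000 (0.5 in Q15) */
#endif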
#ifdef   __cplusplus
}
#endif

#endif /* _ARM_VEC_MATH_H */

/**
 *
 * End of file.
 */