arm_add_q7.c
1 /* 2 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 3 * 4 * SPDX-License-Identifier: Apache-2.0 5 * 6 * Licensed under the Apache License, Version 2.0 (the License); you may 7 * not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 19 /* ---------------------------------------------------------------------- 20 * Project: CMSIS DSP Library 21 * Title: arm_add_q7.c 22 * Description: Q7 vector addition 23 * 24 * $Date: 23 April 2021 25 * $Revision: V1.9.0 26 * 27 * Target Processor: Cortex-M and Cortex-A cores 28 * -------------------------------------------------------------------- */ 29 30 #include "dsp/basic_math_functions.h" 31 32 /** 33 @ingroup groupMath 34 */ 35 36 /** 37 @addtogroup BasicAdd 38 @{ 39 */ 40 41 /** 42 @brief Q7 vector addition. 43 @param[in] pSrcA points to the first input vector 44 @param[in] pSrcB points to the second input vector 45 @param[out] pDst points to the output vector 46 @param[in] blockSize number of samples in each vector 47 @return none 48 49 @par Scaling and Overflow Behavior 50 The function uses saturating arithmetic. 51 Results outside of the allowable Q7 range [0x80 0x7F] are saturated. 52 */ 53 54 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) 55 56 #include "arm_helium_utils.h" 57 58 void arm_add_q7( 59 const q7_t * pSrcA, 60 const q7_t * pSrcB, 61 q7_t * pDst, 62 uint32_t blockSize) 63 { 64 uint32_t blkCnt; /* loop counters */ 65 q7x16_t vecA; 66 q7x16_t vecB; 67 68 /* Compute 16 outputs at a time */ 69 blkCnt = blockSize >> 4; 70 while (blkCnt > 0U) 71 { 72 /* 73 * C = A + B 74 * Add and then store the results in the destination buffer. 75 */ 76 vecA = vld1q(pSrcA); 77 vecB = vld1q(pSrcB); 78 vst1q(pDst, vqaddq(vecA, vecB)); 79 /* 80 * Decrement the blockSize loop counter 81 */ 82 blkCnt--; 83 /* 84 * advance vector source and destination pointers 85 */ 86 pSrcA += 16; 87 pSrcB += 16; 88 pDst += 16; 89 } 90 /* 91 * tail 92 */ 93 blkCnt = blockSize & 0xF; 94 if (blkCnt > 0U) 95 { 96 mve_pred16_t p0 = vctp8q(blkCnt); 97 vecA = vld1q(pSrcA); 98 vecB = vld1q(pSrcB); 99 vstrbq_p(pDst, vqaddq(vecA, vecB), p0); 100 } 101 } 102 #else 103 void arm_add_q7( 104 const q7_t * pSrcA, 105 const q7_t * pSrcB, 106 q7_t * pDst, 107 uint32_t blockSize) 108 { 109 uint32_t blkCnt; /* Loop counter */ 110 111 #if defined (ARM_MATH_LOOPUNROLL) 112 113 /* Loop unrolling: Compute 4 outputs at a time */ 114 blkCnt = blockSize >> 2U; 115 116 while (blkCnt > 0U) 117 { 118 /* C = A + B */ 119 120 #if defined (ARM_MATH_DSP) 121 /* Add and store result in destination buffer (4 samples at a time). */ 122 write_q7x4_ia (&pDst, __QADD8 (read_q7x4_ia (&pSrcA), read_q7x4_ia (&pSrcB))); 123 #else 124 *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8); 125 *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8); 126 *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8); 127 *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8); 128 #endif 129 130 /* Decrement loop counter */ 131 blkCnt--; 132 } 133 134 /* Loop unrolling: Compute remaining outputs */ 135 blkCnt = blockSize % 0x4U; 136 137 #else 138 139 /* Initialize blkCnt with number of samples */ 140 blkCnt = blockSize; 141 142 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 143 144 while (blkCnt > 0U) 145 { 146 /* C = A + B */ 147 148 /* Add and store result in destination buffer. */ 149 *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ + *pSrcB++, 8); 150 151 /* Decrement loop counter */ 152 blkCnt--; 153 } 154 155 } 156 #endif /* defined(ARM_MATH_MVEI) */ 157 /** 158 @} end of BasicAdd group 159 */