arm_bitonic_sort_f32.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_bitonic_sort_f32.c 4 * Description: Floating point bitonic sort 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/support_functions.h" 30 #include "arm_sorting.h" 31 32 33 #if !defined(ARM_MATH_NEON) 34 35 static void arm_bitonic_sort_core_f32(float32_t *pSrc, uint32_t n, uint8_t dir) 36 { 37 uint32_t step; 38 uint32_t k, j; 39 float32_t *leftPtr, *rightPtr; 40 float32_t temp; 41 42 step = n>>1; 43 leftPtr = pSrc; 44 rightPtr = pSrc+n-1; 45 46 for(k=0; k<step; k++) 47 { 48 if(dir == (*leftPtr > *rightPtr)) 49 { 50 // Swap 51 temp=*leftPtr; 52 *leftPtr=*rightPtr; 53 *rightPtr=temp; 54 } 55 56 leftPtr++; // Move right 57 rightPtr--; // Move left 58 } 59 60 // Merge 61 for(step=(n>>2); step>0; step/=2) 62 { 63 for(j=0; j<n; j=j+step*2) 64 { 65 leftPtr = pSrc+j; 66 rightPtr = pSrc+j+step; 67 68 for(k=0; k<step; k++) 69 { 70 if(*leftPtr > *rightPtr) 71 { 72 // Swap 73 temp=*leftPtr; 74 *leftPtr=*rightPtr; 75 *rightPtr=temp; 76 } 77 78 leftPtr++; 79 rightPtr++; 80 } 81 } 82 } 83 } 84 #endif 85 86 #if defined(ARM_MATH_NEON) 87 88 89 static float32x4x2_t arm_bitonic_resort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir) 90 { 91 /* Start with two vectors: 92 * +---+---+---+---+ 93 * | a | b | c | d | 94 * +---+---+---+---+ 95 * +---+---+---+---+ 96 * | e | f | g | h | 97 * +---+---+---+---+ 98 * All the elements of the first are guaranteed to be less than or equal to 99 * all of the elements in the second, and both vectors are bitonic. 100 * We need to perform these operations to completely sort both lists: 101 * vminmax([abcd],[efgh]) 102 * vminmax([acbd],[egfh]) 103 */ 104 vtrn128_64q(a, b); 105 /* +---+---+---+---+ 106 * | a | b | e | f | 107 * +---+---+---+---+ 108 * +---+---+---+---+ 109 * | c | d | g | h | 110 * +---+---+---+---+ 111 */ 112 if(dir) 113 vminmaxq(a, b); 114 else 115 vminmaxq(b, a); 116 117 vtrn128_32q(a, b); 118 /* +---+---+---+---+ 119 * | a | c | e | g | 120 * +---+---+---+---+ 121 * +---+---+---+---+ 122 * | b | d | f | h | 123 * +---+---+---+---+ 124 */ 125 if(dir) 126 vminmaxq(a, b); 127 else 128 vminmaxq(b, a); 129 130 return vzipq_f32(a, b); 131 } 132 133 134 static float32x4x2_t arm_bitonic_merge_8_f32(float32x4_t a, float32x4_t b, uint8_t dir) 135 { 136 /* a and b are guaranteed to be bitonic */ 137 // Reverse the element of the second vector 138 b = vrev128q_f32(b); 139 140 // Compare the two vectors 141 if(dir) 142 vminmaxq(a, b); 143 else 144 vminmaxq(b, a); 145 146 // Merge the two vectors 147 float32x4x2_t ab = arm_bitonic_resort_8_f32(a, b, dir); 148 149 return ab; 150 } 151 152 static void arm_bitonic_resort_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir) 153 { 154 /* Start with two vectors: 155 * +---+---+---+---+---+---+---+---+ 156 * | a | b | c | d | e | f | g | h | 157 * +---+---+---+---+---+---+---+---+ 158 * +---+---+---+---+---+---+---+---+ 159 * | i | j | k | l | m | n | o | p | 160 * +---+---+---+---+---+---+---+---+ 161 * All the elements of the first are guaranteed to be less than or equal to 162 * all of the elements in the second, and both vectors are bitonic. 163 * We need to perform these operations to completely sort both lists: 164 * vminmax([abcd],[efgh]) vminmax([ijkl],[mnop]) 165 * vminmax([abef],[cdgh]) vminmax([ijmn],[klop]) 166 * vminmax([acef],[bdfh]) vminmax([ikmo],[jlmp]) 167 */ 168 169 vtrn256_128q(a, b); 170 /* +---+---+---+---+---+---+---+---+ 171 * | a | b | c | d | i | j | k | l | 172 * +---+---+---+---+---+---+---+---+ 173 * +---+---+---+---+---+---+---+---+ 174 * | e | f | g | h | m | n | o | p | 175 * +---+---+---+---+---+---+---+---+ 176 */ 177 if(dir) 178 vminmax256q(a, b); 179 else 180 vminmax256q(b, a); 181 182 vtrn256_64q(a, b); 183 184 /* +---+---+---+---+---+---+---+---+ 185 * | a | b | e | f | i | j | m | n | 186 * +---+---+---+---+---+---+---+---+ 187 * +---+---+---+---+---+---+---+---+ 188 * | c | d | g | h | k | l | o | p | 189 * +---+---+---+---+---+---+---+---+ 190 */ 191 if(dir) 192 vminmax256q(a, b); 193 else 194 vminmax256q(b, a); 195 196 vtrn256_32q(a, b); 197 /* We now have: 198 * +---+---+---+---+---+---+---+---+ 199 * | a | c | e | g | i | k | m | o | 200 * +---+---+---+---+---+---+---+---+ 201 * +---+---+---+---+---+---+---+---+ 202 * | b | d | f | h | j | l | n | p | 203 * +---+---+---+---+---+---+---+---+ 204 */ 205 if(dir) 206 vminmax256q(a, b); 207 else 208 vminmax256q(b, a); 209 210 float32x4x2_t out1 = vzipq_f32(a.val[0], b.val[0]); 211 float32x4x2_t out2 = vzipq_f32(a.val[1], b.val[1]); 212 213 vst1q_f32(pOut, out1.val[0]); 214 vst1q_f32(pOut+4, out1.val[1]); 215 vst1q_f32(pOut+8, out2.val[0]); 216 vst1q_f32(pOut+12, out2.val[1]); 217 } 218 219 static void arm_bitonic_merge_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir) 220 { 221 // Merge two preordered float32x4x2_t 222 vrev256q_f32(b); 223 224 if(dir) 225 vminmax256q(a, b); 226 else 227 vminmax256q(b, a); 228 229 arm_bitonic_resort_16_f32(pOut, a, b, dir); 230 } 231 232 static void arm_bitonic_sort_16_f32(float32_t *pSrc, float32_t *pDst, uint8_t dir) 233 { 234 float32x4_t a; 235 float32x4_t b; 236 float32x4_t c; 237 float32x4_t d; 238 239 // Load 16 samples 240 a = vld1q_f32(pSrc); 241 b = vld1q_f32(pSrc+4); 242 c = vld1q_f32(pSrc+8); 243 d = vld1q_f32(pSrc+12); 244 245 // Bitonic sorting network for 4 samples x 4 times 246 if(dir) 247 { 248 vminmaxq(a, b); 249 vminmaxq(c, d); 250 251 vminmaxq(a, d); 252 vminmaxq(b, c); 253 254 vminmaxq(a, b); 255 vminmaxq(c, d); 256 } 257 else 258 { 259 vminmaxq(b, a); 260 vminmaxq(d, c); 261 262 vminmaxq(d, a); 263 vminmaxq(c, b); 264 265 vminmaxq(b, a); 266 vminmaxq(d, c); 267 } 268 269 float32x4x2_t ab = vtrnq_f32 (a, b); 270 float32x4x2_t cd = vtrnq_f32 (c, d); 271 272 // Transpose 4 ordered arrays of 4 samples 273 a = vcombine_f32(vget_low_f32(ab.val[0]), vget_low_f32(cd.val[0])); 274 b = vcombine_f32(vget_low_f32(ab.val[1]), vget_low_f32(cd.val[1])); 275 c = vcombine_f32(vget_high_f32(ab.val[0]), vget_high_f32(cd.val[0])); 276 d = vcombine_f32(vget_high_f32(ab.val[1]), vget_high_f32(cd.val[1])); 277 278 // Merge pairs of arrays of 4 samples 279 ab = arm_bitonic_merge_8_f32(a, b, dir); 280 cd = arm_bitonic_merge_8_f32(c, d, dir); 281 282 // Merge arrays of 8 samples 283 arm_bitonic_merge_16_f32(pDst, ab, cd, dir); 284 } 285 286 287 288 289 290 static void arm_bitonic_merge_32_f32(float32_t * pSrc, float32x4x2_t ab1, float32x4x2_t ab2, float32x4x2_t cd1, float32x4x2_t cd2, uint8_t dir) 291 { 292 //Compare 293 if(dir) 294 { 295 vminmax256q(ab1, cd1); 296 vminmax256q(ab2, cd2); 297 } 298 else 299 { 300 vminmax256q(cd1, ab1); 301 vminmax256q(cd2, ab2); 302 } 303 //Transpose 256 304 float32x4_t temp; 305 306 temp = ab2.val[0]; 307 ab2.val[0] = cd1.val[0]; 308 cd1.val[0] = temp; 309 temp = ab2.val[1]; 310 ab2.val[1] = cd1.val[1]; 311 cd1.val[1] = temp; 312 313 //Compare 314 if(dir) 315 { 316 vminmax256q(ab1, cd1); 317 vminmax256q(ab2, cd2); 318 } 319 else 320 { 321 vminmax256q(cd1, ab1); 322 vminmax256q(cd2, ab2); 323 } 324 325 //Transpose 128 326 arm_bitonic_merge_16_f32(pSrc+0, ab1, cd1, dir); 327 arm_bitonic_merge_16_f32(pSrc+16, ab2, cd2, dir); 328 } 329 330 static void arm_bitonic_merge_64_f32(float32_t * pSrc, uint8_t dir) 331 { 332 float32x4x2_t ab1, ab2, ab3, ab4; 333 float32x4x2_t cd1, cd2, cd3, cd4; 334 335 //Load and reverse second array 336 ab1.val[0] = vld1q_f32(pSrc+0 ); 337 ab1.val[1] = vld1q_f32(pSrc+4 ); 338 ab2.val[0] = vld1q_f32(pSrc+8 ); 339 ab2.val[1] = vld1q_f32(pSrc+12); 340 ab3.val[0] = vld1q_f32(pSrc+16); 341 ab3.val[1] = vld1q_f32(pSrc+20); 342 ab4.val[0] = vld1q_f32(pSrc+24); 343 ab4.val[1] = vld1q_f32(pSrc+28); 344 345 vldrev128q_f32(cd4.val[1], pSrc+32); 346 vldrev128q_f32(cd4.val[0], pSrc+36); 347 vldrev128q_f32(cd3.val[1], pSrc+40); 348 vldrev128q_f32(cd3.val[0], pSrc+44); 349 vldrev128q_f32(cd2.val[1], pSrc+48); 350 vldrev128q_f32(cd2.val[0], pSrc+52); 351 vldrev128q_f32(cd1.val[1], pSrc+56); 352 vldrev128q_f32(cd1.val[0], pSrc+60); 353 354 //Compare 355 if(dir) 356 { 357 vminmax256q(ab1, cd1); 358 vminmax256q(ab2, cd2); 359 vminmax256q(ab3, cd3); 360 vminmax256q(ab4, cd4); 361 } 362 else 363 { 364 vminmax256q(cd1, ab1); 365 vminmax256q(cd2, ab2); 366 vminmax256q(cd3, ab3); 367 vminmax256q(cd4, ab4); 368 } 369 370 //Transpose 512 371 float32x4_t temp; 372 373 temp = ab3.val[0]; 374 ab3.val[0] = cd1.val[0]; 375 cd1.val[0] = temp; 376 temp = ab3.val[1]; 377 ab3.val[1] = cd1.val[1]; 378 cd1.val[1] = temp; 379 temp = ab4.val[0]; 380 ab4.val[0] = cd2.val[0]; 381 cd2.val[0] = temp; 382 temp = ab4.val[1]; 383 ab4.val[1] = cd2.val[1]; 384 cd2.val[1] = temp; 385 386 //Compare 387 if(dir) 388 { 389 vminmax256q(ab1, cd1); 390 vminmax256q(ab2, cd2); 391 vminmax256q(ab3, cd3); 392 vminmax256q(ab4, cd4); 393 } 394 else 395 { 396 vminmax256q(cd1, ab1); 397 vminmax256q(cd2, ab2); 398 vminmax256q(cd3, ab3); 399 vminmax256q(cd4, ab4); 400 } 401 402 //Transpose 256 403 arm_bitonic_merge_32_f32(pSrc+0, ab1, ab2, cd1, cd2, dir); 404 arm_bitonic_merge_32_f32(pSrc+32, ab3, ab4, cd3, cd4, dir); 405 } 406 407 static void arm_bitonic_merge_128_f32(float32_t * pSrc, uint8_t dir) 408 { 409 float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8; 410 float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8; 411 412 //Load and reverse second array 413 ab1.val[0] = vld1q_f32(pSrc+0 ); 414 ab1.val[1] = vld1q_f32(pSrc+4 ); 415 ab2.val[0] = vld1q_f32(pSrc+8 ); 416 ab2.val[1] = vld1q_f32(pSrc+12); 417 ab3.val[0] = vld1q_f32(pSrc+16); 418 ab3.val[1] = vld1q_f32(pSrc+20); 419 ab4.val[0] = vld1q_f32(pSrc+24); 420 ab4.val[1] = vld1q_f32(pSrc+28); 421 ab5.val[0] = vld1q_f32(pSrc+32); 422 ab5.val[1] = vld1q_f32(pSrc+36); 423 ab6.val[0] = vld1q_f32(pSrc+40); 424 ab6.val[1] = vld1q_f32(pSrc+44); 425 ab7.val[0] = vld1q_f32(pSrc+48); 426 ab7.val[1] = vld1q_f32(pSrc+52); 427 ab8.val[0] = vld1q_f32(pSrc+56); 428 ab8.val[1] = vld1q_f32(pSrc+60); 429 430 vldrev128q_f32(cd8.val[1], pSrc+64); 431 vldrev128q_f32(cd8.val[0], pSrc+68); 432 vldrev128q_f32(cd7.val[1], pSrc+72); 433 vldrev128q_f32(cd7.val[0], pSrc+76); 434 vldrev128q_f32(cd6.val[1], pSrc+80); 435 vldrev128q_f32(cd6.val[0], pSrc+84); 436 vldrev128q_f32(cd5.val[1], pSrc+88); 437 vldrev128q_f32(cd5.val[0], pSrc+92); 438 vldrev128q_f32(cd4.val[1], pSrc+96); 439 vldrev128q_f32(cd4.val[0], pSrc+100); 440 vldrev128q_f32(cd3.val[1], pSrc+104); 441 vldrev128q_f32(cd3.val[0], pSrc+108); 442 vldrev128q_f32(cd2.val[1], pSrc+112); 443 vldrev128q_f32(cd2.val[0], pSrc+116); 444 vldrev128q_f32(cd1.val[1], pSrc+120); 445 vldrev128q_f32(cd1.val[0], pSrc+124); 446 447 //Compare 448 if(dir) 449 { 450 vminmax256q(ab1, cd1); 451 vminmax256q(ab2, cd2); 452 vminmax256q(ab3, cd3); 453 vminmax256q(ab4, cd4); 454 vminmax256q(ab5, cd5); 455 vminmax256q(ab6, cd6); 456 vminmax256q(ab7, cd7); 457 vminmax256q(ab8, cd8); 458 } 459 else 460 { 461 vminmax256q(cd1, ab1); 462 vminmax256q(cd2, ab2); 463 vminmax256q(cd3, ab3); 464 vminmax256q(cd4, ab4); 465 vminmax256q(cd5, ab5); 466 vminmax256q(cd6, ab6); 467 vminmax256q(cd7, ab7); 468 vminmax256q(cd8, ab8); 469 } 470 471 //Transpose 472 float32x4_t temp; 473 474 temp = ab5.val[0]; 475 ab5.val[0] = cd1.val[0]; 476 cd1.val[0] = temp; 477 temp = ab5.val[1]; 478 ab5.val[1] = cd1.val[1]; 479 cd1.val[1] = temp; 480 temp = ab6.val[0]; 481 ab6.val[0] = cd2.val[0]; 482 cd2.val[0] = temp; 483 temp = ab6.val[1]; 484 ab6.val[1] = cd2.val[1]; 485 cd2.val[1] = temp; 486 temp = ab7.val[0]; 487 ab7.val[0] = cd3.val[0]; 488 cd3.val[0] = temp; 489 temp = ab7.val[1]; 490 ab7.val[1] = cd3.val[1]; 491 cd3.val[1] = temp; 492 temp = ab8.val[0]; 493 ab8.val[0] = cd4.val[0]; 494 cd4.val[0] = temp; 495 temp = ab8.val[1]; 496 ab8.val[1] = cd4.val[1]; 497 cd4.val[1] = temp; 498 499 //Compare 500 if(dir) 501 { 502 vminmax256q(ab1, cd1); 503 vminmax256q(ab2, cd2); 504 vminmax256q(ab3, cd3); 505 vminmax256q(ab4, cd4); 506 vminmax256q(ab5, cd5); 507 vminmax256q(ab6, cd6); 508 vminmax256q(ab7, cd7); 509 vminmax256q(ab8, cd8); 510 } 511 else 512 { 513 vminmax256q(cd1, ab1); 514 vminmax256q(cd2, ab2); 515 vminmax256q(cd3, ab3); 516 vminmax256q(cd4, ab4); 517 vminmax256q(cd5, ab5); 518 vminmax256q(cd6, ab6); 519 vminmax256q(cd7, ab7); 520 vminmax256q(cd8, ab8); 521 } 522 523 vst1q_f32(pSrc, ab1.val[0]); 524 vst1q_f32(pSrc+4, ab1.val[1]); 525 vst1q_f32(pSrc+8, ab2.val[0]); 526 vst1q_f32(pSrc+12, ab2.val[1]); 527 vst1q_f32(pSrc+16, ab3.val[0]); 528 vst1q_f32(pSrc+20, ab3.val[1]); 529 vst1q_f32(pSrc+24, ab4.val[0]); 530 vst1q_f32(pSrc+28, ab4.val[1]); 531 vst1q_f32(pSrc+32, cd1.val[0]); 532 vst1q_f32(pSrc+36, cd1.val[1]); 533 vst1q_f32(pSrc+40, cd2.val[0]); 534 vst1q_f32(pSrc+44, cd2.val[1]); 535 vst1q_f32(pSrc+48, cd3.val[0]); 536 vst1q_f32(pSrc+52, cd3.val[1]); 537 vst1q_f32(pSrc+56, cd4.val[0]); 538 vst1q_f32(pSrc+60, cd4.val[1]); 539 vst1q_f32(pSrc+64, ab5.val[0]); 540 vst1q_f32(pSrc+68, ab5.val[1]); 541 vst1q_f32(pSrc+72, ab6.val[0]); 542 vst1q_f32(pSrc+76, ab6.val[1]); 543 vst1q_f32(pSrc+80, ab7.val[0]); 544 vst1q_f32(pSrc+84, ab7.val[1]); 545 vst1q_f32(pSrc+88, ab8.val[0]); 546 vst1q_f32(pSrc+92, ab8.val[1]); 547 vst1q_f32(pSrc+96, cd5.val[0]); 548 vst1q_f32(pSrc+100, cd5.val[1]); 549 vst1q_f32(pSrc+104, cd6.val[0]); 550 vst1q_f32(pSrc+108, cd6.val[1]); 551 vst1q_f32(pSrc+112, cd7.val[0]); 552 vst1q_f32(pSrc+116, cd7.val[1]); 553 vst1q_f32(pSrc+120, cd8.val[0]); 554 vst1q_f32(pSrc+124, cd8.val[1]); 555 556 //Transpose 557 arm_bitonic_merge_64_f32(pSrc+0 , dir); 558 arm_bitonic_merge_64_f32(pSrc+64, dir); 559 } 560 561 static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir) 562 { 563 float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8; 564 float32x4x2_t ab9, ab10, ab11, ab12, ab13, ab14, ab15, ab16; 565 float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8; 566 float32x4x2_t cd9, cd10, cd11, cd12, cd13, cd14, cd15, cd16; 567 568 //Load and reverse second array 569 ab1.val[0] = vld1q_f32(pSrc+0 ); 570 ab1.val[1] = vld1q_f32(pSrc+4 ); 571 ab2.val[0] = vld1q_f32(pSrc+8 ); 572 ab2.val[1] = vld1q_f32(pSrc+12 ); 573 ab3.val[0] = vld1q_f32(pSrc+16 ); 574 ab3.val[1] = vld1q_f32(pSrc+20 ); 575 ab4.val[0] = vld1q_f32(pSrc+24 ); 576 ab4.val[1] = vld1q_f32(pSrc+28 ); 577 ab5.val[0] = vld1q_f32(pSrc+32 ); 578 ab5.val[1] = vld1q_f32(pSrc+36 ); 579 ab6.val[0] = vld1q_f32(pSrc+40 ); 580 ab6.val[1] = vld1q_f32(pSrc+44 ); 581 ab7.val[0] = vld1q_f32(pSrc+48 ); 582 ab7.val[1] = vld1q_f32(pSrc+52 ); 583 ab8.val[0] = vld1q_f32(pSrc+56 ); 584 ab8.val[1] = vld1q_f32(pSrc+60 ); 585 ab9.val[0] = vld1q_f32(pSrc+64 ); 586 ab9.val[1] = vld1q_f32(pSrc+68 ); 587 ab10.val[0] = vld1q_f32(pSrc+72 ); 588 ab10.val[1] = vld1q_f32(pSrc+76 ); 589 ab11.val[0] = vld1q_f32(pSrc+80 ); 590 ab11.val[1] = vld1q_f32(pSrc+84 ); 591 ab12.val[0] = vld1q_f32(pSrc+88 ); 592 ab12.val[1] = vld1q_f32(pSrc+92 ); 593 ab13.val[0] = vld1q_f32(pSrc+96 ); 594 ab13.val[1] = vld1q_f32(pSrc+100); 595 ab14.val[0] = vld1q_f32(pSrc+104); 596 ab14.val[1] = vld1q_f32(pSrc+108); 597 ab15.val[0] = vld1q_f32(pSrc+112); 598 ab15.val[1] = vld1q_f32(pSrc+116); 599 ab16.val[0] = vld1q_f32(pSrc+120); 600 ab16.val[1] = vld1q_f32(pSrc+124); 601 602 vldrev128q_f32(cd16.val[1], pSrc+128); 603 vldrev128q_f32(cd16.val[0], pSrc+132); 604 vldrev128q_f32(cd15.val[1], pSrc+136); 605 vldrev128q_f32(cd15.val[0], pSrc+140); 606 vldrev128q_f32(cd14.val[1], pSrc+144); 607 vldrev128q_f32(cd14.val[0], pSrc+148); 608 vldrev128q_f32(cd13.val[1], pSrc+152); 609 vldrev128q_f32(cd13.val[0], pSrc+156); 610 vldrev128q_f32(cd12.val[1], pSrc+160); 611 vldrev128q_f32(cd12.val[0], pSrc+164); 612 vldrev128q_f32(cd11.val[1], pSrc+168); 613 vldrev128q_f32(cd11.val[0], pSrc+172); 614 vldrev128q_f32(cd10.val[1], pSrc+176); 615 vldrev128q_f32(cd10.val[0], pSrc+180); 616 vldrev128q_f32(cd9.val[1] , pSrc+184); 617 vldrev128q_f32(cd9.val[0] , pSrc+188); 618 vldrev128q_f32(cd8.val[1] , pSrc+192); 619 vldrev128q_f32(cd8.val[0] , pSrc+196); 620 vldrev128q_f32(cd7.val[1] , pSrc+200); 621 vldrev128q_f32(cd7.val[0] , pSrc+204); 622 vldrev128q_f32(cd6.val[1] , pSrc+208); 623 vldrev128q_f32(cd6.val[0] , pSrc+212); 624 vldrev128q_f32(cd5.val[1] , pSrc+216); 625 vldrev128q_f32(cd5.val[0] , pSrc+220); 626 vldrev128q_f32(cd4.val[1] , pSrc+224); 627 vldrev128q_f32(cd4.val[0] , pSrc+228); 628 vldrev128q_f32(cd3.val[1] , pSrc+232); 629 vldrev128q_f32(cd3.val[0] , pSrc+236); 630 vldrev128q_f32(cd2.val[1] , pSrc+240); 631 vldrev128q_f32(cd2.val[0] , pSrc+244); 632 vldrev128q_f32(cd1.val[1] , pSrc+248); 633 vldrev128q_f32(cd1.val[0] , pSrc+252); 634 635 //Compare 636 if(dir) 637 { 638 vminmax256q(ab1 , cd1 ); 639 vminmax256q(ab2 , cd2 ); 640 vminmax256q(ab3 , cd3 ); 641 vminmax256q(ab4 , cd4 ); 642 vminmax256q(ab5 , cd5 ); 643 vminmax256q(ab6 , cd6 ); 644 vminmax256q(ab7 , cd7 ); 645 vminmax256q(ab8 , cd8 ); 646 vminmax256q(ab9 , cd9 ); 647 vminmax256q(ab10, cd10); 648 vminmax256q(ab11, cd11); 649 vminmax256q(ab12, cd12); 650 vminmax256q(ab13, cd13); 651 vminmax256q(ab14, cd14); 652 vminmax256q(ab15, cd15); 653 vminmax256q(ab16, cd16); 654 } 655 else 656 { 657 vminmax256q(cd1 , ab1 ); 658 vminmax256q(cd2 , ab2 ); 659 vminmax256q(cd3 , ab3 ); 660 vminmax256q(cd4 , ab4 ); 661 vminmax256q(cd5 , ab5 ); 662 vminmax256q(cd6 , ab6 ); 663 vminmax256q(cd7 , ab7 ); 664 vminmax256q(cd8 , ab8 ); 665 vminmax256q(cd9 , ab9 ); 666 vminmax256q(cd10, ab10); 667 vminmax256q(cd11, ab11); 668 vminmax256q(cd12, ab12); 669 vminmax256q(cd13, ab13); 670 vminmax256q(cd14, ab14); 671 vminmax256q(cd15, ab15); 672 vminmax256q(cd16, ab16); 673 } 674 675 //Transpose 676 float32x4_t temp; 677 678 temp = ab9.val[0]; 679 ab9.val[0] = cd1.val[0]; 680 cd1.val[0] = temp; 681 temp = ab9.val[1]; 682 ab9.val[1] = cd1.val[1]; 683 cd1.val[1] = temp; 684 temp = ab10.val[0]; 685 ab10.val[0] = cd2.val[0]; 686 cd2.val[0] = temp; 687 temp = ab10.val[1]; 688 ab10.val[1] = cd2.val[1]; 689 cd2.val[1] = temp; 690 temp = ab11.val[0]; 691 ab11.val[0] = cd3.val[0]; 692 cd3.val[0] = temp; 693 temp = ab11.val[1]; 694 ab11.val[1] = cd3.val[1]; 695 cd3.val[1] = temp; 696 temp = ab12.val[0]; 697 ab12.val[0] = cd4.val[0]; 698 cd4.val[0] = temp; 699 temp = ab12.val[1]; 700 ab12.val[1] = cd4.val[1]; 701 cd4.val[1] = temp; 702 temp = ab13.val[0]; 703 ab13.val[0] = cd5.val[0]; 704 cd5.val[0] = temp; 705 temp = ab13.val[1]; 706 ab13.val[1] = cd5.val[1]; 707 cd5.val[1] = temp; 708 temp = ab14.val[0]; 709 ab14.val[0] = cd6.val[0]; 710 cd6.val[0] = temp; 711 temp = ab14.val[1]; 712 ab14.val[1] = cd6.val[1]; 713 cd6.val[1] = temp; 714 temp = ab15.val[0]; 715 ab15.val[0] = cd7.val[0]; 716 cd7.val[0] = temp; 717 temp = ab15.val[1]; 718 ab15.val[1] = cd7.val[1]; 719 cd7.val[1] = temp; 720 temp = ab16.val[0]; 721 ab16.val[0] = cd8.val[0]; 722 cd8.val[0] = temp; 723 temp = ab16.val[1]; 724 ab16.val[1] = cd8.val[1]; 725 cd8.val[1] = temp; 726 727 //Compare 728 if(dir) 729 { 730 vminmax256q(ab1 , cd1 ); 731 vminmax256q(ab2 , cd2 ); 732 vminmax256q(ab3 , cd3 ); 733 vminmax256q(ab4 , cd4 ); 734 vminmax256q(ab5 , cd5 ); 735 vminmax256q(ab6 , cd6 ); 736 vminmax256q(ab7 , cd7 ); 737 vminmax256q(ab8 , cd8 ); 738 vminmax256q(ab9 , cd9 ); 739 vminmax256q(ab10, cd10); 740 vminmax256q(ab11, cd11); 741 vminmax256q(ab12, cd12); 742 vminmax256q(ab13, cd13); 743 vminmax256q(ab14, cd14); 744 vminmax256q(ab15, cd15); 745 vminmax256q(ab16, cd16); 746 } 747 else 748 { 749 vminmax256q(cd1 , ab1 ); 750 vminmax256q(cd2 , ab2 ); 751 vminmax256q(cd3 , ab3 ); 752 vminmax256q(cd4 , ab4 ); 753 vminmax256q(cd5 , ab5 ); 754 vminmax256q(cd6 , ab6 ); 755 vminmax256q(cd7 , ab7 ); 756 vminmax256q(cd8 , ab8 ); 757 vminmax256q(cd9 , ab9 ); 758 vminmax256q(cd10, ab10); 759 vminmax256q(cd11, ab11); 760 vminmax256q(cd12, ab12); 761 vminmax256q(cd13, ab13); 762 vminmax256q(cd14, ab14); 763 vminmax256q(cd15, ab15); 764 vminmax256q(cd16, ab16); 765 } 766 767 vst1q_f32(pSrc, ab1.val[0] ); 768 vst1q_f32(pSrc+4, ab1.val[1] ); 769 vst1q_f32(pSrc+8, ab2.val[0] ); 770 vst1q_f32(pSrc+12, ab2.val[1] ); 771 vst1q_f32(pSrc+16, ab3.val[0] ); 772 vst1q_f32(pSrc+20, ab3.val[1] ); 773 vst1q_f32(pSrc+24, ab4.val[0] ); 774 vst1q_f32(pSrc+28, ab4.val[1] ); 775 vst1q_f32(pSrc+32, ab5.val[0] ); 776 vst1q_f32(pSrc+36, ab5.val[1] ); 777 vst1q_f32(pSrc+40, ab6.val[0] ); 778 vst1q_f32(pSrc+44, ab6.val[1] ); 779 vst1q_f32(pSrc+48, ab7.val[0] ); 780 vst1q_f32(pSrc+52, ab7.val[1] ); 781 vst1q_f32(pSrc+56, ab8.val[0] ); 782 vst1q_f32(pSrc+60, ab8.val[1] ); 783 vst1q_f32(pSrc+64, cd1.val[0] ); 784 vst1q_f32(pSrc+68, cd1.val[1] ); 785 vst1q_f32(pSrc+72, cd2.val[0] ); 786 vst1q_f32(pSrc+76, cd2.val[1] ); 787 vst1q_f32(pSrc+80, cd3.val[0] ); 788 vst1q_f32(pSrc+84, cd3.val[1] ); 789 vst1q_f32(pSrc+88, cd4.val[0] ); 790 vst1q_f32(pSrc+92, cd4.val[1] ); 791 vst1q_f32(pSrc+96, cd5.val[0] ); 792 vst1q_f32(pSrc+100, cd5.val[1] ); 793 vst1q_f32(pSrc+104, cd6.val[0] ); 794 vst1q_f32(pSrc+108, cd6.val[1] ); 795 vst1q_f32(pSrc+112, cd7.val[0] ); 796 vst1q_f32(pSrc+116, cd7.val[1] ); 797 vst1q_f32(pSrc+120, cd8.val[0] ); 798 vst1q_f32(pSrc+124, cd8.val[1] ); 799 vst1q_f32(pSrc+128, ab9.val[0] ); 800 vst1q_f32(pSrc+132, ab9.val[1] ); 801 vst1q_f32(pSrc+136, ab10.val[0]); 802 vst1q_f32(pSrc+140, ab10.val[1]); 803 vst1q_f32(pSrc+144, ab11.val[0]); 804 vst1q_f32(pSrc+148, ab11.val[1]); 805 vst1q_f32(pSrc+152, ab12.val[0]); 806 vst1q_f32(pSrc+156, ab12.val[1]); 807 vst1q_f32(pSrc+160, ab13.val[0]); 808 vst1q_f32(pSrc+164, ab13.val[1]); 809 vst1q_f32(pSrc+168, ab14.val[0]); 810 vst1q_f32(pSrc+172, ab14.val[1]); 811 vst1q_f32(pSrc+176, ab15.val[0]); 812 vst1q_f32(pSrc+180, ab15.val[1]); 813 vst1q_f32(pSrc+184, ab16.val[0]); 814 vst1q_f32(pSrc+188, ab16.val[1]); 815 vst1q_f32(pSrc+192, cd9.val[0] ); 816 vst1q_f32(pSrc+196, cd9.val[1] ); 817 vst1q_f32(pSrc+200, cd10.val[0]); 818 vst1q_f32(pSrc+204, cd10.val[1]); 819 vst1q_f32(pSrc+208, cd11.val[0]); 820 vst1q_f32(pSrc+212, cd11.val[1]); 821 vst1q_f32(pSrc+216, cd12.val[0]); 822 vst1q_f32(pSrc+220, cd12.val[1]); 823 vst1q_f32(pSrc+224, cd13.val[0]); 824 vst1q_f32(pSrc+228, cd13.val[1]); 825 vst1q_f32(pSrc+232, cd14.val[0]); 826 vst1q_f32(pSrc+236, cd14.val[1]); 827 vst1q_f32(pSrc+240, cd15.val[0]); 828 vst1q_f32(pSrc+244, cd15.val[1]); 829 vst1q_f32(pSrc+248, cd16.val[0]); 830 vst1q_f32(pSrc+252, cd16.val[1]); 831 832 //Transpose 833 arm_bitonic_merge_128_f32(pSrc+0 , dir); 834 arm_bitonic_merge_128_f32(pSrc+128, dir); 835 } 836 837 #define SWAP(a,i,j) \ 838 temp = vgetq_lane_f32(a, j); \ 839 a = vsetq_lane_f32(vgetq_lane_f32(a, i), a, j);\ 840 a = vsetq_lane_f32(temp, a, i); 841 842 static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir) 843 { 844 float32_t temp; 845 846 847 if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) ) 848 { 849 SWAP(a,0,1); 850 } 851 if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) ) 852 { 853 SWAP(a,2,3); 854 } 855 856 if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 3)) ) 857 { 858 SWAP(a,0,3); 859 } 860 if( dir==(vgetq_lane_f32(a, 1) > vgetq_lane_f32(a, 2)) ) 861 { 862 SWAP(a,1,2); 863 } 864 865 if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) ) 866 { 867 SWAP(a,0,1); 868 } 869 if( dir==(vgetq_lane_f32(a, 2)>vgetq_lane_f32(a, 3)) ) 870 { 871 SWAP(a,2,3); 872 } 873 874 return a; 875 } 876 877 static float32x4x2_t arm_bitonic_sort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir) 878 { 879 a = arm_bitonic_sort_4_f32(a, dir); 880 b = arm_bitonic_sort_4_f32(b, dir); 881 return arm_bitonic_merge_8_f32(a, b, dir); 882 } 883 884 885 886 #endif 887 888 /** 889 @ingroup groupSupport 890 */ 891 892 /** 893 @defgroup Sorting Vector sorting algorithms 894 895 Sort the elements of a vector 896 897 There are separate functions for floating-point, Q31, Q15, and Q7 data types. 898 */ 899 900 /** 901 @addtogroup Sorting 902 @{ 903 */ 904 905 /** 906 * @private 907 * @param[in] S points to an instance of the sorting structure. 908 * @param[in] pSrc points to the block of input data. 909 * @param[out] pDst points to the block of output data 910 * @param[in] blockSize number of samples to process. 911 */ 912 void arm_bitonic_sort_f32( 913 const arm_sort_instance_f32 * S, 914 float32_t * pSrc, 915 float32_t * pDst, 916 uint32_t blockSize) 917 { 918 uint16_t s, i; 919 uint8_t dir = S->dir; 920 921 #ifdef ARM_MATH_NEON 922 (void)s; 923 924 float32_t * pOut; 925 uint16_t counter = blockSize>>5; 926 927 if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only 928 { 929 if(pSrc == pDst) // in-place 930 pOut = pSrc; 931 else 932 pOut = pDst; 933 934 float32x4x2_t ab1, ab2; 935 float32x4x2_t cd1, cd2; 936 937 if(blockSize == 1) 938 pOut = pSrc; 939 else if(blockSize == 2) 940 { 941 float32_t temp; 942 943 if( dir==(pSrc[0]>pSrc[1]) ) 944 { 945 temp = pSrc[1]; 946 pOut[1] = pSrc[0]; 947 pOut[0] = temp; 948 } 949 else 950 pOut = pSrc; 951 } 952 else if(blockSize == 4) 953 { 954 float32x4_t a = vld1q_f32(pSrc); 955 956 a = arm_bitonic_sort_4_f32(a, dir); 957 958 vst1q_f32(pOut, a); 959 } 960 else if(blockSize == 8) 961 { 962 float32x4_t a; 963 float32x4_t b; 964 float32x4x2_t ab; 965 966 a = vld1q_f32(pSrc); 967 b = vld1q_f32(pSrc+4); 968 969 ab = arm_bitonic_sort_8_f32(a, b, dir); 970 971 vst1q_f32(pOut, ab.val[0]); 972 vst1q_f32(pOut+4, ab.val[1]); 973 } 974 else if(blockSize >=16) 975 { 976 // Order 16 bits long vectors 977 for(i=0; i<blockSize; i=i+16) 978 arm_bitonic_sort_16_f32(pSrc+i, pOut+i, dir); 979 980 // Merge 981 for(i=0; i<counter; i++) 982 { 983 // Load and reverse second vector 984 ab1.val[0] = vld1q_f32(pOut+32*i+0 ); 985 ab1.val[1] = vld1q_f32(pOut+32*i+4 ); 986 ab2.val[0] = vld1q_f32(pOut+32*i+8 ); 987 ab2.val[1] = vld1q_f32(pOut+32*i+12); 988 989 vldrev128q_f32(cd2.val[1], pOut+32*i+16); 990 vldrev128q_f32(cd2.val[0], pOut+32*i+20); 991 vldrev128q_f32(cd1.val[1], pOut+32*i+24); 992 vldrev128q_f32(cd1.val[0], pOut+32*i+28); 993 994 arm_bitonic_merge_32_f32(pOut+32*i, ab1, ab2, cd1, cd2, dir); 995 } 996 997 counter = counter>>1; 998 for(i=0; i<counter; i++) 999 arm_bitonic_merge_64_f32(pOut+64*i, dir); 1000 1001 counter = counter>>1; 1002 for(i=0; i<counter; i++) 1003 arm_bitonic_merge_128_f32(pOut+128*i, dir); 1004 1005 counter = counter>>1; 1006 for(i=0; i<counter; i++) 1007 arm_bitonic_merge_256_f32(pOut+256*i, dir); 1008 1009 // Etc... 1010 } 1011 } 1012 1013 #else 1014 1015 float32_t * pA; 1016 1017 if(pSrc != pDst) // out-of-place 1018 { 1019 memcpy(pDst, pSrc, blockSize*sizeof(float32_t) ); 1020 pA = pDst; 1021 } 1022 else 1023 pA = pSrc; 1024 1025 1026 if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only 1027 { 1028 for(s=2; s<=blockSize; s=s*2) 1029 { 1030 for(i=0; i<blockSize; i=i+s) 1031 arm_bitonic_sort_core_f32(pA+i, s, dir); 1032 } 1033 } 1034 #endif 1035 } 1036 1037 /** 1038 @} end of Sorting group 1039 */