/* Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c */
   1  /* ----------------------------------------------------------------------
   2   * Project:      CMSIS DSP Library
   3   * Title:        arm_bitonic_sort_f32.c
   4   * Description:  Floating point bitonic sort
   5   *
   6   * $Date:        23 April 2021
   7   * $Revision:    V1.9.0
   8   *
   9   * Target Processor: Cortex-M and Cortex-A cores
  10   * -------------------------------------------------------------------- */
  11  /*
  12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13   *
  14   * SPDX-License-Identifier: Apache-2.0
  15   *
  16   * Licensed under the Apache License, Version 2.0 (the License); you may
  17   * not use this file except in compliance with the License.
  18   * You may obtain a copy of the License at
  19   *
  20   * www.apache.org/licenses/LICENSE-2.0
  21   *
  22   * Unless required by applicable law or agreed to in writing, software
  23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25   * See the License for the specific language governing permissions and
  26   * limitations under the License.
  27   */
  28  
  29  #include "dsp/support_functions.h"
  30  #include "arm_sorting.h"
  31  
  32  
  33  #if !defined(ARM_MATH_NEON)
  34  
  35  static void arm_bitonic_sort_core_f32(float32_t *pSrc, uint32_t n, uint8_t dir)
  36  {
  37      uint32_t step;
  38      uint32_t k, j;
  39      float32_t *leftPtr, *rightPtr;
  40      float32_t temp;
  41  
  42      step = n>>1;
  43      leftPtr = pSrc;
  44      rightPtr = pSrc+n-1;
  45  
  46      for(k=0; k<step; k++)
  47      {
  48  	if(dir == (*leftPtr > *rightPtr))
  49  	{
  50              // Swap
  51  	    temp=*leftPtr;
  52  	    *leftPtr=*rightPtr;
  53  	    *rightPtr=temp;
  54  	}
  55  
  56  	leftPtr++;  // Move right
  57  	rightPtr--; // Move left
  58      }
  59  
  60      // Merge
  61      for(step=(n>>2); step>0; step/=2)
  62      {
  63  	for(j=0; j<n; j=j+step*2)
  64  	{
  65  	    leftPtr  = pSrc+j;
  66  	    rightPtr = pSrc+j+step;
  67  
  68  	    for(k=0; k<step; k++)
  69  	    {
  70  		if(*leftPtr > *rightPtr)
  71  		{
  72  		    // Swap
  73  	    	    temp=*leftPtr;
  74  		    *leftPtr=*rightPtr;
  75  		    *rightPtr=temp;
  76  		}
  77  
  78  		leftPtr++;
  79  		rightPtr++;
  80  	    }
  81  	}
  82      }
  83  }
  84  #endif
  85  
  86  #if defined(ARM_MATH_NEON)
  87  
  88  
/**
 * @brief  Final resort step for two 4-element vectors forming one bitonic
 *         8-element sequence: performs the last two compare/exchange levels
 *         and re-interleaves the result.
 *
 * @param[in] a    first vector; all of its elements are <= (dir==1) the
 *                 elements of b on entry (see diagram below)
 * @param[in] b    second vector
 * @param[in] dir  sorting direction: 1 = ascending, 0 = descending
 * @return    the 8 sorted values as a zipped float32x4x2_t
 *
 * NOTE: vtrn128_64q / vtrn128_32q / vminmaxq are statement macros from
 * arm_sorting.h that modify their arguments in place.
 */
static float32x4x2_t arm_bitonic_resort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
{
    /* Start with two vectors:
     * +---+---+---+---+
     * | a | b | c | d |
     * +---+---+---+---+
     * +---+---+---+---+
     * | e | f | g | h |
     * +---+---+---+---+
     * All the elements of the first are guaranteed to be less than or equal to
     * all of the elements in the second, and both vectors are bitonic.
     * We need to perform these operations to completely sort both lists:
     * vminmax([abcd],[efgh]) 
     * vminmax([acbd],[egfh]) 
     */
    vtrn128_64q(a, b);
    /* +---+---+---+---+
     * | a | b | e | f |
     * +---+---+---+---+
     * +---+---+---+---+
     * | c | d | g | h |
     * +---+---+---+---+
     */
    /* Swapped operand order flips min/max roles, giving a descending sort. */
    if(dir)
        vminmaxq(a, b);
    else
        vminmaxq(b, a);
    
    vtrn128_32q(a, b);
    /* +---+---+---+---+
     * | a | c | e | g |
     * +---+---+---+---+
     * +---+---+---+---+
     * | b | d | f | h |
     * +---+---+---+---+
     */
    if(dir)
        vminmaxq(a, b);
    else
        vminmaxq(b, a);
    
    /* Interleave back into memory order. */
    return vzipq_f32(a, b);
}
 132  
 133  
/**
 * @brief  Merge two sorted 4-element vectors into one sorted 8-element
 *         result (bitonic merge).
 *
 * @param[in] a    first sorted vector
 * @param[in] b    second sorted vector
 * @param[in] dir  sorting direction: 1 = ascending, 0 = descending
 * @return    the 8 merged, sorted values
 */
static float32x4x2_t arm_bitonic_merge_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
{
    /* a and b are guaranteed to be bitonic */
    // Reverse the elements of the second vector so a..b forms one bitonic run
    b = vrev128q_f32(b);

    // Compare the two vectors (operand order selects the direction)
    if(dir)
        vminmaxq(a, b);
    else
    vminmaxq(b, a);

    // Merge the two vectors with the final resort network
    float32x4x2_t ab = arm_bitonic_resort_8_f32(a, b, dir);

    return ab;
}
 151  
/**
 * @brief  Final resort step for a 16-element bitonic merge: three
 *         compare/exchange levels on two 8-element halves, then the sorted
 *         result is stored to pOut.
 *
 * @param[out] pOut  destination for the 16 sorted values
 * @param[in]  a     first 8-element half (as float32x4x2_t)
 * @param[in]  b     second 8-element half
 * @param[in]  dir   sorting direction: 1 = ascending, 0 = descending
 *
 * NOTE: vtrn256_*q and vminmax256q are statement macros from arm_sorting.h
 * operating on both .val lanes in place.
 */
static void arm_bitonic_resort_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
{
    /* Start with two vectors:
     * +---+---+---+---+---+---+---+---+
     * | a | b | c | d | e | f | g | h |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | i | j | k | l | m | n | o | p |
     * +---+---+---+---+---+---+---+---+
     * All the elements of the first are guaranteed to be less than or equal to
     * all of the elements in the second, and both vectors are bitonic.
     * We need to perform these operations to completely sort both lists:
     * vminmax([abcd],[efgh]) vminmax([ijkl],[mnop])
     * vminmax([abef],[cdgh]) vminmax([ijmn],[klop])
     * vminmax([acef],[bdfh]) vminmax([ikmo],[jlmp])
     */

    vtrn256_128q(a, b);
    /* +---+---+---+---+---+---+---+---+
     * | a | b | c | d | i | j | k | l |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | e | f | g | h | m | n | o | p |
     * +---+---+---+---+---+---+---+---+
     */
    /* Swapped operand order flips min/max roles, giving a descending sort. */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);
    
    vtrn256_64q(a, b);
    
    /* +---+---+---+---+---+---+---+---+
     * | a | b | e | f | i | j | m | n |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | c | d | g | h | k | l | o | p |
     * +---+---+---+---+---+---+---+---+
     */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);
    
    vtrn256_32q(a, b);
    /* We now have:
     * +---+---+---+---+---+---+---+---+
     * | a | c | e | g | i | k | m | o |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | b | d | f | h | j | l | n | p |
     * +---+---+---+---+---+---+---+---+
     */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);
    
    /* Interleave the lanes back into memory order and store. */
    float32x4x2_t out1 = vzipq_f32(a.val[0], b.val[0]);
    float32x4x2_t out2 = vzipq_f32(a.val[1], b.val[1]);
    
    vst1q_f32(pOut, out1.val[0]);
    vst1q_f32(pOut+4, out1.val[1]);
    vst1q_f32(pOut+8, out2.val[0]);
    vst1q_f32(pOut+12, out2.val[1]);
} 
 218  
/**
 * @brief  Merge two sorted 8-element runs into one sorted 16-element run,
 *         stored to pOut.
 *
 * @param[out] pOut  destination for the 16 sorted values
 * @param[in]  a     first sorted 8-element run
 * @param[in]  b     second sorted 8-element run
 * @param[in]  dir   sorting direction: 1 = ascending, 0 = descending
 */
static void arm_bitonic_merge_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
{
    // Merge two preordered float32x4x2_t: reverse b in place so that
    // a followed by b forms a single bitonic sequence
    // (vrev256q_f32 is a statement macro modifying its argument)
    vrev256q_f32(b);

    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);

    // Finish with the 16-element resort network and store
    arm_bitonic_resort_16_f32(pOut, a, b, dir);
}
 231  
/**
 * @brief  Fully sort 16 contiguous float32 samples.
 *
 * @param[in]  pSrc  points to the 16 input samples
 * @param[out] pDst  points to the 16-sample output buffer
 * @param[in]  dir   sorting direction: 1 = ascending, 0 = descending
 *
 * Strategy: run a 4-element sorting network across four lanes in parallel,
 * transpose so each vector holds one sorted 4-run, then merge pairwise
 * (4+4 -> 8, 8+8 -> 16).
 */
static void arm_bitonic_sort_16_f32(float32_t *pSrc, float32_t *pDst, uint8_t dir)
{
    float32x4_t a;
    float32x4_t b;
    float32x4_t c;
    float32x4_t d;

    // Load 16 samples
    a = vld1q_f32(pSrc);
    b = vld1q_f32(pSrc+4);
    c = vld1q_f32(pSrc+8); 
    d = vld1q_f32(pSrc+12);
    
    // Bitonic sorting network for 4 samples x 4 times
    // (operand order of vminmaxq selects the direction)
    if(dir)
    {
        vminmaxq(a, b);
        vminmaxq(c, d);
        
        vminmaxq(a, d);
        vminmaxq(b, c);
        
        vminmaxq(a, b);
        vminmaxq(c, d);
    }
    else
    {
        vminmaxq(b, a);
        vminmaxq(d, c);
        
        vminmaxq(d, a);
        vminmaxq(c, b);
        
        vminmaxq(b, a);
        vminmaxq(d, c);
    }

    float32x4x2_t ab = vtrnq_f32 (a, b);
    float32x4x2_t cd = vtrnq_f32 (c, d);
    
    // Transpose 4 ordered arrays of 4 samples
    a = vcombine_f32(vget_low_f32(ab.val[0]), vget_low_f32(cd.val[0]));
    b = vcombine_f32(vget_low_f32(ab.val[1]), vget_low_f32(cd.val[1]));
    c = vcombine_f32(vget_high_f32(ab.val[0]), vget_high_f32(cd.val[0]));
    d = vcombine_f32(vget_high_f32(ab.val[1]), vget_high_f32(cd.val[1]));

    // Merge pairs of arrays of 4 samples
    ab = arm_bitonic_merge_8_f32(a, b, dir);
    cd = arm_bitonic_merge_8_f32(c, d, dir);
    
    // Merge arrays of 8 samples and store the sorted 16 to pDst
    arm_bitonic_merge_16_f32(pDst, ab, cd, dir);
}
 285  
 286  
 287  
 288  
 289  
/**
 * @brief  Merge step for 32 elements held in four float32x4x2_t registers
 *         (ab1/ab2 = first 16, cd1/cd2 = second 16, the cd half already
 *         reversed by the caller). The sorted result is stored via the two
 *         final arm_bitonic_merge_16_f32 calls.
 *
 * @param[out] pSrc  destination for the 32 sorted values
 * @param[in]  ab1, ab2  first 16 elements
 * @param[in]  cd1, cd2  second 16 elements (pre-reversed by caller)
 * @param[in]  dir   sorting direction: 1 = ascending, 0 = descending
 */
static void arm_bitonic_merge_32_f32(float32_t * pSrc, float32x4x2_t ab1, float32x4x2_t ab2, float32x4x2_t cd1, float32x4x2_t cd2, uint8_t dir)
{
    //Compare (operand order of vminmax256q selects the direction)
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
    }
    //Transpose 256: exchange the middle 256-bit halves (ab2 <-> cd1)
    float32x4_t temp;

    temp = ab2.val[0];
    ab2.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab2.val[1];
    ab2.val[1] = cd1.val[1];
    cd1.val[1] = temp;

    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
    }
    
    //Transpose 128: finish each 16-element half independently
    arm_bitonic_merge_16_f32(pSrc+0,  ab1, cd1, dir);
    arm_bitonic_merge_16_f32(pSrc+16, ab2, cd2, dir);
}
 329  
/**
 * @brief  In-place merge of two sorted 32-element runs at pSrc into one
 *         sorted 64-element run.
 *
 * @param[in,out] pSrc  points to the 64 samples (two sorted 32-runs)
 * @param[in]     dir   sorting direction: 1 = ascending, 0 = descending
 *
 * The second run is loaded reversed (vldrev128q_f32) so the whole 64
 * elements form one bitonic sequence before the compare/exchange levels.
 */
static void arm_bitonic_merge_64_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2, ab3, ab4;
    float32x4x2_t cd1, cd2, cd3, cd4;

    //Load and reverse second array
    ab1.val[0] = vld1q_f32(pSrc+0 );
    ab1.val[1] = vld1q_f32(pSrc+4 );
    ab2.val[0] = vld1q_f32(pSrc+8 ); 
    ab2.val[1] = vld1q_f32(pSrc+12);
    ab3.val[0] = vld1q_f32(pSrc+16);
    ab3.val[1] = vld1q_f32(pSrc+20);
    ab4.val[0] = vld1q_f32(pSrc+24); 
    ab4.val[1] = vld1q_f32(pSrc+28);

    // Second run loaded back-to-front, both across and within vectors
    vldrev128q_f32(cd4.val[1], pSrc+32);
    vldrev128q_f32(cd4.val[0], pSrc+36);
    vldrev128q_f32(cd3.val[1], pSrc+40);
    vldrev128q_f32(cd3.val[0], pSrc+44);
    vldrev128q_f32(cd2.val[1], pSrc+48);
    vldrev128q_f32(cd2.val[0], pSrc+52);
    vldrev128q_f32(cd1.val[1], pSrc+56);
    vldrev128q_f32(cd1.val[0], pSrc+60);
    
    //Compare (operand order selects the direction)
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
    }

    //Transpose 512: exchange the middle 512-bit halves (ab3/ab4 <-> cd1/cd2)
    float32x4_t temp;

    temp = ab3.val[0];
    ab3.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab3.val[1];
    ab3.val[1] = cd1.val[1];
    cd1.val[1] = temp;
    temp = ab4.val[0];
    ab4.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab4.val[1];
    ab4.val[1] = cd2.val[1];
    cd2.val[1] = temp;

    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
    }
    
    //Transpose 256: finish each 32-element half independently
    arm_bitonic_merge_32_f32(pSrc+0,  ab1, ab2, cd1, cd2, dir);
    arm_bitonic_merge_32_f32(pSrc+32, ab3, ab4, cd3, cd4, dir);
}
 406  
/**
 * @brief  In-place merge of two sorted 64-element runs at pSrc into one
 *         sorted 128-element run.
 *
 * @param[in,out] pSrc  points to the 128 samples (two sorted 64-runs)
 * @param[in]     dir   sorting direction: 1 = ascending, 0 = descending
 *
 * Same scheme as arm_bitonic_merge_64_f32 at double width: load the second
 * run reversed, do two compare/exchange levels with a register transpose in
 * between, store the intermediate state, then recurse into two 64-merges.
 */
static void arm_bitonic_merge_128_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
    float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;

    //Load and reverse second array
    ab1.val[0] = vld1q_f32(pSrc+0 );
    ab1.val[1] = vld1q_f32(pSrc+4 );
    ab2.val[0] = vld1q_f32(pSrc+8 ); 
    ab2.val[1] = vld1q_f32(pSrc+12);
    ab3.val[0] = vld1q_f32(pSrc+16);
    ab3.val[1] = vld1q_f32(pSrc+20);
    ab4.val[0] = vld1q_f32(pSrc+24); 
    ab4.val[1] = vld1q_f32(pSrc+28);
    ab5.val[0] = vld1q_f32(pSrc+32);
    ab5.val[1] = vld1q_f32(pSrc+36);
    ab6.val[0] = vld1q_f32(pSrc+40); 
    ab6.val[1] = vld1q_f32(pSrc+44);
    ab7.val[0] = vld1q_f32(pSrc+48);
    ab7.val[1] = vld1q_f32(pSrc+52);
    ab8.val[0] = vld1q_f32(pSrc+56); 
    ab8.val[1] = vld1q_f32(pSrc+60);

    // Second run loaded back-to-front, both across and within vectors
    vldrev128q_f32(cd8.val[1], pSrc+64);
    vldrev128q_f32(cd8.val[0], pSrc+68);
    vldrev128q_f32(cd7.val[1], pSrc+72);
    vldrev128q_f32(cd7.val[0], pSrc+76);
    vldrev128q_f32(cd6.val[1], pSrc+80);
    vldrev128q_f32(cd6.val[0], pSrc+84);
    vldrev128q_f32(cd5.val[1], pSrc+88);
    vldrev128q_f32(cd5.val[0], pSrc+92);
    vldrev128q_f32(cd4.val[1], pSrc+96);
    vldrev128q_f32(cd4.val[0], pSrc+100);
    vldrev128q_f32(cd3.val[1], pSrc+104);
    vldrev128q_f32(cd3.val[0], pSrc+108);
    vldrev128q_f32(cd2.val[1], pSrc+112);
    vldrev128q_f32(cd2.val[0], pSrc+116);
    vldrev128q_f32(cd1.val[1], pSrc+120);
    vldrev128q_f32(cd1.val[0], pSrc+124);
    
    //Compare (operand order selects the direction)
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
        vminmax256q(ab5, cd5);
        vminmax256q(ab6, cd6);
        vminmax256q(ab7, cd7);
        vminmax256q(ab8, cd8);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
        vminmax256q(cd5, ab5);
        vminmax256q(cd6, ab6);
        vminmax256q(cd7, ab7);
        vminmax256q(cd8, ab8);
    }
    
    //Transpose: exchange the middle halves (ab5..ab8 <-> cd1..cd4)
    float32x4_t temp;

    temp = ab5.val[0];
    ab5.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab5.val[1];
    ab5.val[1] = cd1.val[1];
    cd1.val[1] = temp;
    temp = ab6.val[0];
    ab6.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab6.val[1];
    ab6.val[1] = cd2.val[1];
    cd2.val[1] = temp;
    temp = ab7.val[0];
    ab7.val[0] = cd3.val[0];
    cd3.val[0] = temp;
    temp = ab7.val[1];
    ab7.val[1] = cd3.val[1];
    cd3.val[1] = temp;
    temp = ab8.val[0];
    ab8.val[0] = cd4.val[0];
    cd4.val[0] = temp;
    temp = ab8.val[1];
    ab8.val[1] = cd4.val[1];
    cd4.val[1] = temp;

    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
        vminmax256q(ab5, cd5);
        vminmax256q(ab6, cd6);
        vminmax256q(ab7, cd7);
        vminmax256q(ab8, cd8);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
        vminmax256q(cd5, ab5);
        vminmax256q(cd6, ab6);
        vminmax256q(cd7, ab7);
        vminmax256q(cd8, ab8);
    }

    // Spill intermediate state back to memory; note the interleaved store
    // order (ab1..ab4, cd1..cd4, ab5..ab8, cd5..cd8) which realizes the
    // 512-bit transpose in memory instead of in registers
    vst1q_f32(pSrc,     ab1.val[0]);
    vst1q_f32(pSrc+4,   ab1.val[1]);
    vst1q_f32(pSrc+8,   ab2.val[0]);
    vst1q_f32(pSrc+12,  ab2.val[1]);
    vst1q_f32(pSrc+16,  ab3.val[0]);
    vst1q_f32(pSrc+20,  ab3.val[1]);
    vst1q_f32(pSrc+24,  ab4.val[0]);
    vst1q_f32(pSrc+28,  ab4.val[1]);
    vst1q_f32(pSrc+32,  cd1.val[0]);
    vst1q_f32(pSrc+36,  cd1.val[1]);
    vst1q_f32(pSrc+40,  cd2.val[0]);
    vst1q_f32(pSrc+44,  cd2.val[1]);
    vst1q_f32(pSrc+48,  cd3.val[0]);
    vst1q_f32(pSrc+52,  cd3.val[1]);
    vst1q_f32(pSrc+56,  cd4.val[0]);
    vst1q_f32(pSrc+60,  cd4.val[1]);
    vst1q_f32(pSrc+64,  ab5.val[0]);
    vst1q_f32(pSrc+68,  ab5.val[1]);
    vst1q_f32(pSrc+72,  ab6.val[0]);
    vst1q_f32(pSrc+76,  ab6.val[1]);
    vst1q_f32(pSrc+80,  ab7.val[0]);
    vst1q_f32(pSrc+84,  ab7.val[1]);
    vst1q_f32(pSrc+88,  ab8.val[0]);
    vst1q_f32(pSrc+92,  ab8.val[1]);
    vst1q_f32(pSrc+96,  cd5.val[0]);
    vst1q_f32(pSrc+100, cd5.val[1]);
    vst1q_f32(pSrc+104, cd6.val[0]);
    vst1q_f32(pSrc+108, cd6.val[1]);
    vst1q_f32(pSrc+112, cd7.val[0]);
    vst1q_f32(pSrc+116, cd7.val[1]);
    vst1q_f32(pSrc+120, cd8.val[0]);
    vst1q_f32(pSrc+124, cd8.val[1]);

    //Transpose: finish each 64-element half independently
    arm_bitonic_merge_64_f32(pSrc+0 , dir);
    arm_bitonic_merge_64_f32(pSrc+64, dir);
}
 560  
/**
 * @brief  In-place merge of two sorted 128-element runs at pSrc into one
 *         sorted 256-element run.
 *
 * @param[in,out] pSrc  points to the 256 samples (two sorted 128-runs)
 * @param[in]     dir   sorting direction: 1 = ascending, 0 = descending
 *
 * Same scheme as arm_bitonic_merge_128_f32 at double width: load the second
 * run reversed, run two compare/exchange levels with a register transpose in
 * between, store, then recurse into two 128-merges.
 */
static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
    float32x4x2_t ab9, ab10, ab11, ab12, ab13, ab14, ab15, ab16;
    float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
    float32x4x2_t cd9, cd10, cd11, cd12, cd13, cd14, cd15, cd16;

    //Load and reverse second array
    ab1.val[0]  = vld1q_f32(pSrc+0  );
    ab1.val[1]  = vld1q_f32(pSrc+4  );
    ab2.val[0]  = vld1q_f32(pSrc+8  ); 
    ab2.val[1]  = vld1q_f32(pSrc+12 );
    ab3.val[0]  = vld1q_f32(pSrc+16 );
    ab3.val[1]  = vld1q_f32(pSrc+20 );
    ab4.val[0]  = vld1q_f32(pSrc+24 ); 
    ab4.val[1]  = vld1q_f32(pSrc+28 );
    ab5.val[0]  = vld1q_f32(pSrc+32 );
    ab5.val[1]  = vld1q_f32(pSrc+36 );
    ab6.val[0]  = vld1q_f32(pSrc+40 ); 
    ab6.val[1]  = vld1q_f32(pSrc+44 );
    ab7.val[0]  = vld1q_f32(pSrc+48 );
    ab7.val[1]  = vld1q_f32(pSrc+52 );
    ab8.val[0]  = vld1q_f32(pSrc+56 ); 
    ab8.val[1]  = vld1q_f32(pSrc+60 );
    ab9.val[0]  = vld1q_f32(pSrc+64 );
    ab9.val[1]  = vld1q_f32(pSrc+68 );
    ab10.val[0] = vld1q_f32(pSrc+72 ); 
    ab10.val[1] = vld1q_f32(pSrc+76 );
    ab11.val[0] = vld1q_f32(pSrc+80 );
    ab11.val[1] = vld1q_f32(pSrc+84 );
    ab12.val[0] = vld1q_f32(pSrc+88 ); 
    ab12.val[1] = vld1q_f32(pSrc+92 );
    ab13.val[0] = vld1q_f32(pSrc+96 );
    ab13.val[1] = vld1q_f32(pSrc+100);
    ab14.val[0] = vld1q_f32(pSrc+104); 
    ab14.val[1] = vld1q_f32(pSrc+108);
    ab15.val[0] = vld1q_f32(pSrc+112);
    ab15.val[1] = vld1q_f32(pSrc+116);
    ab16.val[0] = vld1q_f32(pSrc+120); 
    ab16.val[1] = vld1q_f32(pSrc+124);

    // Second run loaded back-to-front, both across and within vectors
    vldrev128q_f32(cd16.val[1], pSrc+128);
    vldrev128q_f32(cd16.val[0], pSrc+132);
    vldrev128q_f32(cd15.val[1], pSrc+136);
    vldrev128q_f32(cd15.val[0], pSrc+140);
    vldrev128q_f32(cd14.val[1], pSrc+144);
    vldrev128q_f32(cd14.val[0], pSrc+148);
    vldrev128q_f32(cd13.val[1], pSrc+152);
    vldrev128q_f32(cd13.val[0], pSrc+156);
    vldrev128q_f32(cd12.val[1], pSrc+160);
    vldrev128q_f32(cd12.val[0], pSrc+164);
    vldrev128q_f32(cd11.val[1], pSrc+168);
    vldrev128q_f32(cd11.val[0], pSrc+172);
    vldrev128q_f32(cd10.val[1], pSrc+176);
    vldrev128q_f32(cd10.val[0], pSrc+180);
    vldrev128q_f32(cd9.val[1] , pSrc+184);
    vldrev128q_f32(cd9.val[0] , pSrc+188);
    vldrev128q_f32(cd8.val[1] , pSrc+192);
    vldrev128q_f32(cd8.val[0] , pSrc+196);
    vldrev128q_f32(cd7.val[1] , pSrc+200);
    vldrev128q_f32(cd7.val[0] , pSrc+204);
    vldrev128q_f32(cd6.val[1] , pSrc+208);
    vldrev128q_f32(cd6.val[0] , pSrc+212);
    vldrev128q_f32(cd5.val[1] , pSrc+216);
    vldrev128q_f32(cd5.val[0] , pSrc+220);
    vldrev128q_f32(cd4.val[1] , pSrc+224);
    vldrev128q_f32(cd4.val[0] , pSrc+228);
    vldrev128q_f32(cd3.val[1] , pSrc+232);
    vldrev128q_f32(cd3.val[0] , pSrc+236);
    vldrev128q_f32(cd2.val[1] , pSrc+240);
    vldrev128q_f32(cd2.val[0] , pSrc+244);
    vldrev128q_f32(cd1.val[1] , pSrc+248);
    vldrev128q_f32(cd1.val[0] , pSrc+252);
    
    //Compare (operand order selects the direction)
    if(dir)
    {
        vminmax256q(ab1 , cd1 );
        vminmax256q(ab2 , cd2 );
        vminmax256q(ab3 , cd3 );
        vminmax256q(ab4 , cd4 );
        vminmax256q(ab5 , cd5 );
        vminmax256q(ab6 , cd6 );
        vminmax256q(ab7 , cd7 );
        vminmax256q(ab8 , cd8 );
        vminmax256q(ab9 , cd9 );
        vminmax256q(ab10, cd10);
        vminmax256q(ab11, cd11);
        vminmax256q(ab12, cd12);
        vminmax256q(ab13, cd13);
        vminmax256q(ab14, cd14);
        vminmax256q(ab15, cd15);
        vminmax256q(ab16, cd16);
    }
    else
    {
        vminmax256q(cd1 , ab1 );
        vminmax256q(cd2 , ab2 );
        vminmax256q(cd3 , ab3 );
        vminmax256q(cd4 , ab4 );
        vminmax256q(cd5 , ab5 );
        vminmax256q(cd6 , ab6 );
        vminmax256q(cd7 , ab7 );
        vminmax256q(cd8 , ab8 );
        vminmax256q(cd9 , ab9 );
        vminmax256q(cd10, ab10);
        vminmax256q(cd11, ab11);
        vminmax256q(cd12, ab12);
        vminmax256q(cd13, ab13);
        vminmax256q(cd14, ab14);
        vminmax256q(cd15, ab15);
        vminmax256q(cd16, ab16);
    }

    //Transpose: exchange the middle halves (ab9..ab16 <-> cd1..cd8)
    float32x4_t temp;

    temp = ab9.val[0];
    ab9.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab9.val[1];
    ab9.val[1] = cd1.val[1];
    cd1.val[1] = temp;
    temp = ab10.val[0];
    ab10.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab10.val[1];
    ab10.val[1] = cd2.val[1];
    cd2.val[1] = temp;
    temp = ab11.val[0];
    ab11.val[0] = cd3.val[0];
    cd3.val[0] = temp;
    temp = ab11.val[1];
    ab11.val[1] = cd3.val[1];
    cd3.val[1] = temp;
    temp = ab12.val[0];
    ab12.val[0] = cd4.val[0];
    cd4.val[0] = temp;
    temp = ab12.val[1];
    ab12.val[1] = cd4.val[1];
    cd4.val[1] = temp;
    temp = ab13.val[0];
    ab13.val[0] = cd5.val[0];
    cd5.val[0] = temp;
    temp = ab13.val[1];
    ab13.val[1] = cd5.val[1];
    cd5.val[1] = temp;
    temp = ab14.val[0];
    ab14.val[0] = cd6.val[0];
    cd6.val[0] = temp;
    temp = ab14.val[1];
    ab14.val[1] = cd6.val[1];
    cd6.val[1] = temp;
    temp = ab15.val[0];
    ab15.val[0] = cd7.val[0];
    cd7.val[0] = temp;
    temp = ab15.val[1];
    ab15.val[1] = cd7.val[1];
    cd7.val[1] = temp;
    temp = ab16.val[0];
    ab16.val[0] = cd8.val[0];
    cd8.val[0] = temp;
    temp = ab16.val[1];
    ab16.val[1] = cd8.val[1];
    cd8.val[1] = temp;

    //Compare
    if(dir)
    {
        vminmax256q(ab1 , cd1 );
        vminmax256q(ab2 , cd2 );
        vminmax256q(ab3 , cd3 );
        vminmax256q(ab4 , cd4 );
        vminmax256q(ab5 , cd5 );
        vminmax256q(ab6 , cd6 );
        vminmax256q(ab7 , cd7 );
        vminmax256q(ab8 , cd8 );
        vminmax256q(ab9 , cd9 );
        vminmax256q(ab10, cd10);
        vminmax256q(ab11, cd11);
        vminmax256q(ab12, cd12);
        vminmax256q(ab13, cd13);
        vminmax256q(ab14, cd14);
        vminmax256q(ab15, cd15);
        vminmax256q(ab16, cd16);
    }
    else
    {
        vminmax256q(cd1 , ab1 );
        vminmax256q(cd2 , ab2 );
        vminmax256q(cd3 , ab3 );
        vminmax256q(cd4 , ab4 );
        vminmax256q(cd5 , ab5 );
        vminmax256q(cd6 , ab6 );
        vminmax256q(cd7 , ab7 );
        vminmax256q(cd8 , ab8 );
        vminmax256q(cd9 , ab9 );
        vminmax256q(cd10, ab10);
        vminmax256q(cd11, ab11);
        vminmax256q(cd12, ab12);
        vminmax256q(cd13, ab13);
        vminmax256q(cd14, ab14);
        vminmax256q(cd15, ab15);
        vminmax256q(cd16, ab16);
    }

    // Spill intermediate state; interleaved store order realizes the
    // 1024-bit transpose in memory (ab1..8, cd1..8, ab9..16, cd9..16)
    vst1q_f32(pSrc,     ab1.val[0] );
    vst1q_f32(pSrc+4,   ab1.val[1] );
    vst1q_f32(pSrc+8,   ab2.val[0] );
    vst1q_f32(pSrc+12,  ab2.val[1] );
    vst1q_f32(pSrc+16,  ab3.val[0] );
    vst1q_f32(pSrc+20,  ab3.val[1] );
    vst1q_f32(pSrc+24,  ab4.val[0] );
    vst1q_f32(pSrc+28,  ab4.val[1] );
    vst1q_f32(pSrc+32,  ab5.val[0] );
    vst1q_f32(pSrc+36,  ab5.val[1] );
    vst1q_f32(pSrc+40,  ab6.val[0] );
    vst1q_f32(pSrc+44,  ab6.val[1] );
    vst1q_f32(pSrc+48,  ab7.val[0] );
    vst1q_f32(pSrc+52,  ab7.val[1] );
    vst1q_f32(pSrc+56,  ab8.val[0] );
    vst1q_f32(pSrc+60,  ab8.val[1] );
    vst1q_f32(pSrc+64,  cd1.val[0] );
    vst1q_f32(pSrc+68,  cd1.val[1] );
    vst1q_f32(pSrc+72,  cd2.val[0] );
    vst1q_f32(pSrc+76,  cd2.val[1] );
    vst1q_f32(pSrc+80,  cd3.val[0] );
    vst1q_f32(pSrc+84,  cd3.val[1] );
    vst1q_f32(pSrc+88,  cd4.val[0] );
    vst1q_f32(pSrc+92,  cd4.val[1] );
    vst1q_f32(pSrc+96,  cd5.val[0] );
    vst1q_f32(pSrc+100, cd5.val[1] );
    vst1q_f32(pSrc+104, cd6.val[0] );
    vst1q_f32(pSrc+108, cd6.val[1] );
    vst1q_f32(pSrc+112, cd7.val[0] );
    vst1q_f32(pSrc+116, cd7.val[1] );
    vst1q_f32(pSrc+120, cd8.val[0] );
    vst1q_f32(pSrc+124, cd8.val[1] );
    vst1q_f32(pSrc+128, ab9.val[0] );
    vst1q_f32(pSrc+132, ab9.val[1] );
    vst1q_f32(pSrc+136, ab10.val[0]);
    vst1q_f32(pSrc+140, ab10.val[1]);
    vst1q_f32(pSrc+144, ab11.val[0]);
    vst1q_f32(pSrc+148, ab11.val[1]);
    vst1q_f32(pSrc+152, ab12.val[0]);
    vst1q_f32(pSrc+156, ab12.val[1]);
    vst1q_f32(pSrc+160, ab13.val[0]);
    vst1q_f32(pSrc+164, ab13.val[1]);
    vst1q_f32(pSrc+168, ab14.val[0]);
    vst1q_f32(pSrc+172, ab14.val[1]);
    vst1q_f32(pSrc+176, ab15.val[0]);
    vst1q_f32(pSrc+180, ab15.val[1]);
    vst1q_f32(pSrc+184, ab16.val[0]);
    vst1q_f32(pSrc+188, ab16.val[1]);
    vst1q_f32(pSrc+192, cd9.val[0] );
    vst1q_f32(pSrc+196, cd9.val[1] );
    vst1q_f32(pSrc+200, cd10.val[0]);
    vst1q_f32(pSrc+204, cd10.val[1]);
    vst1q_f32(pSrc+208, cd11.val[0]);
    vst1q_f32(pSrc+212, cd11.val[1]);
    vst1q_f32(pSrc+216, cd12.val[0]);
    vst1q_f32(pSrc+220, cd12.val[1]);
    vst1q_f32(pSrc+224, cd13.val[0]);
    vst1q_f32(pSrc+228, cd13.val[1]);
    vst1q_f32(pSrc+232, cd14.val[0]);
    vst1q_f32(pSrc+236, cd14.val[1]);
    vst1q_f32(pSrc+240, cd15.val[0]);
    vst1q_f32(pSrc+244, cd15.val[1]);
    vst1q_f32(pSrc+248, cd16.val[0]);
    vst1q_f32(pSrc+252, cd16.val[1]);

    //Transpose: finish each 128-element half independently
    arm_bitonic_merge_128_f32(pSrc+0  , dir);
    arm_bitonic_merge_128_f32(pSrc+128, dir);
}
 836  
/* Exchange lanes i and j of vector a (uses a float32_t 'temp' from the
 * enclosing scope).
 * NOTE(review): multi-statement macro without do { } while (0) — would
 * misbehave in an unbraced if/else; all current call sites are braced. */
#define SWAP(a,i,j)                            \
    temp = vgetq_lane_f32(a, j);                   \
    a = vsetq_lane_f32(vgetq_lane_f32(a, i), a, j);\
    a = vsetq_lane_f32(temp, a, i);

/**
 * @brief  Sort the 4 lanes of a vector with a 6-comparator bitonic network.
 *
 * @param[in] a    vector to sort
 * @param[in] dir  sorting direction: 1 = ascending, 0 = descending
 * @return    the sorted vector
 *
 * Each comparator uses dir==(x>y): ascending swaps when x > y, descending
 * swaps when x <= y. Comparator pattern: (0,1)(2,3), (0,3)(1,2), (0,1)(2,3).
 */
static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir)
{
    float32_t temp;


    /* Stage 1: sort adjacent pairs */
    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
    {
        SWAP(a,0,1);
    }
    if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) )
    {
       SWAP(a,2,3);
    }

    /* Stage 2: bitonic merge across the pairs (outer and inner lanes) */
    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 3)) )
    {
      SWAP(a,0,3);
    }
    if( dir==(vgetq_lane_f32(a, 1) > vgetq_lane_f32(a, 2)) )
    {
      SWAP(a,1,2);
    }

    /* Stage 3: final adjacent compare/exchange */
    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
    {
      SWAP(a,0,1);
    }
    if( dir==(vgetq_lane_f32(a, 2)>vgetq_lane_f32(a, 3)) )
    {
      SWAP(a,2,3);
    }

    return a;
}
 876  
 877  static float32x4x2_t arm_bitonic_sort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
 878  {
 879      a = arm_bitonic_sort_4_f32(a, dir);
 880      b = arm_bitonic_sort_4_f32(b, dir);
 881      return arm_bitonic_merge_8_f32(a, b, dir);
 882  }
 883  
 884  
 885  
 886  #endif
 887  
 888  /**
 889    @ingroup groupSupport
 890   */
 891  
 892  /**
 893    @defgroup Sorting Vector sorting algorithms
 894  
 895    Sort the elements of a vector
 896  
 897    There are separate functions for floating-point, Q31, Q15, and Q7 data types.
 898   */
 899  
 900  /**
 901    @addtogroup Sorting
 902    @{
 903   */
 904  
 905  /**
 906     * @private
 907     * @param[in]  S          points to an instance of the sorting structure.
 908     * @param[in]  pSrc       points to the block of input data.
 909     * @param[out] pDst       points to the block of output data
 910     * @param[in]  blockSize  number of samples to process.
 911     */
/* Bitonic sort entry point.
 *
 * Only power-of-2 blockSize values are sorted; for any other size the
 * function leaves the output untouched (silently does nothing).
 *
 * NEON path: sort 16-sample chunks, then merge 32/64/128/256-wide; sizes
 * above 256 are not merged further (see "Etc..." below).
 * Scalar path: bottom-up bitonic network via arm_bitonic_sort_core_f32.
 */
void arm_bitonic_sort_f32(
const arm_sort_instance_f32 * S, 
      float32_t * pSrc,
      float32_t * pDst, 
      uint32_t blockSize)
{
    uint16_t s, i;
    uint8_t dir = S->dir;   /* 1 = ascending, 0 = descending */

#ifdef ARM_MATH_NEON
    (void)s;                /* s only used by the scalar path */

    float32_t * pOut;
    uint16_t counter = blockSize>>5;   /* number of 32-sample merge groups */

    if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
    {
        if(pSrc == pDst) // in-place
            pOut = pSrc;
        else
            pOut = pDst;

        float32x4x2_t ab1, ab2;
        float32x4x2_t cd1, cd2;

        /* NOTE(review): for blockSize 1, and blockSize 2 without a swap,
           pOut is redirected to pSrc, so in the out-of-place case pDst is
           never written — callers appear expected to read the result from
           the buffer the data already occupies; confirm against callers. */
        if(blockSize == 1)
            pOut = pSrc;
        else if(blockSize == 2)
        {
            float32_t temp;

            if( dir==(pSrc[0]>pSrc[1]) )
            {
                temp = pSrc[1];
                pOut[1] = pSrc[0];
                pOut[0] = temp;
            }
            else
                pOut = pSrc;
        }
        else if(blockSize == 4)
        {
            float32x4_t a = vld1q_f32(pSrc);

            a = arm_bitonic_sort_4_f32(a, dir);

            vst1q_f32(pOut, a);
        }
        else if(blockSize == 8)
        {
            float32x4_t a;
            float32x4_t b;
            float32x4x2_t ab;

            a = vld1q_f32(pSrc);
            b = vld1q_f32(pSrc+4);

            ab = arm_bitonic_sort_8_f32(a, b, dir);

            vst1q_f32(pOut,   ab.val[0]);
            vst1q_f32(pOut+4, ab.val[1]);
        }
        else if(blockSize >=16)
        {
            // Sort each 16-sample chunk independently
            for(i=0; i<blockSize; i=i+16)
                arm_bitonic_sort_16_f32(pSrc+i, pOut+i, dir);

            // Merge pairs of 16-runs into 32-runs
            for(i=0; i<counter; i++)
            {
                // Load first run normally and second run reversed
                ab1.val[0] = vld1q_f32(pOut+32*i+0 );
                ab1.val[1] = vld1q_f32(pOut+32*i+4 );
                ab2.val[0] = vld1q_f32(pOut+32*i+8 ); 
                ab2.val[1] = vld1q_f32(pOut+32*i+12);

                vldrev128q_f32(cd2.val[1], pOut+32*i+16);
                vldrev128q_f32(cd2.val[0], pOut+32*i+20);
                vldrev128q_f32(cd1.val[1], pOut+32*i+24);
                vldrev128q_f32(cd1.val[0], pOut+32*i+28);

                arm_bitonic_merge_32_f32(pOut+32*i, ab1, ab2, cd1, cd2, dir);
            }

            // Merge 32-runs into 64-runs, then 128, then 256
            counter = counter>>1;
            for(i=0; i<counter; i++)
                arm_bitonic_merge_64_f32(pOut+64*i, dir);

            counter = counter>>1;
            for(i=0; i<counter; i++)
                arm_bitonic_merge_128_f32(pOut+128*i, dir);

            counter = counter>>1;
            for(i=0; i<counter; i++)
                arm_bitonic_merge_256_f32(pOut+256*i, dir);

            // Etc... (no wider merge stages: blockSize > 256 is not
            // fully merged by this implementation)
        }
    }

#else

    float32_t * pA;

    /* Scalar path works in place; copy input first when out-of-place. */
    if(pSrc != pDst) // out-of-place
    {   
        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
        pA = pDst;
    }
    else
        pA = pSrc;


    if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
    {
        /* Bottom-up: merge bitonic runs of doubling size s over the block. */
        for(s=2; s<=blockSize; s=s*2)
        {
            for(i=0; i<blockSize; i=i+s)
                arm_bitonic_sort_core_f32(pA+i, s, dir);
        }
    }
#endif
}
1036  
1037  /**
1038    @} end of Sorting group
1039   */