/ libcsdr.c
libcsdr.c
   1  /*
   2  This software is part of libcsdr, a set of simple DSP routines for
   3  Software Defined Radio.
   4  
   5  Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
   6  All rights reserved.
   7  
   8  Redistribution and use in source and binary forms, with or without
   9  modification, are permitted provided that the following conditions are met:
  10      * Redistributions of source code must retain the above copyright
  11        notice, this list of conditions and the following disclaimer.
  12      * Redistributions in binary form must reproduce the above copyright
  13        notice, this list of conditions and the following disclaimer in the
  14        documentation and/or other materials provided with the distribution.
  15      * Neither the name of the copyright holder nor the
  16        names of its contributors may be used to endorse or promote products
  17        derived from this software without specific prior written permission.
  18  
  19  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  20  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  21  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22  DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
  23  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  24  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  25  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  26  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  28  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29  */
  30  
  31  #include <stdio.h>
  32  #include <time.h>
  33  #include <math.h>
  34  #include <stdlib.h>
  35  #include <string.h>
  36  #include <unistd.h>
  37  #include <limits.h>
  38  #include "libcsdr.h"
  39  #include "predefined.h"
  40  #include <assert.h>
  41  #include <stdarg.h>
  42  
  43  /*
  44             _           _                   __                  _   _
  45            (_)         | |                 / _|                | | (_)
  46   __      ___ _ __   __| | _____      __  | |_ _   _ _ __   ___| |_ _  ___  _ __  ___
  47   \ \ /\ / / | '_ \ / _` |/ _ \ \ /\ / /  |  _| | | | '_ \ / __| __| |/ _ \| '_ \/ __|
  48    \ V  V /| | | | | (_| | (_) \ V  V /   | | | |_| | | | | (__| |_| | (_) | | | \__ \
  49     \_/\_/ |_|_| |_|\__,_|\___/ \_/\_/    |_|  \__,_|_| |_|\___|\__|_|\___/|_| |_|___/
  50  
  51  
  52  */
  53  
  54  #define MFIRDES_GWS(NAME) \
  55      if(!strcmp( #NAME , input )) return WINDOW_ ## NAME;
  56  
  57  window_t firdes_get_window_from_string(char* input)
  58  {
  59      MFIRDES_GWS(BOXCAR);
  60      MFIRDES_GWS(BLACKMAN);
  61      MFIRDES_GWS(HAMMING);
  62      return WINDOW_DEFAULT;
  63  }
  64  
  65  #define MFIRDES_GSW(NAME) \
  66      if(window == WINDOW_ ## NAME) return #NAME;
  67  
  68  char* firdes_get_string_from_window(window_t window)
  69  {
  70      MFIRDES_GSW(BOXCAR);
  71      MFIRDES_GSW(BLACKMAN);
  72      MFIRDES_GSW(HAMMING);
  73      return "INVALID";
  74  }
  75  
  76  float firdes_wkernel_blackman(float rate)
  77  {
  78      //Explanation at Chapter 16 of dspguide.com, page 2
  79      //Blackman window has better stopband attentuation and passband ripple than Hamming, but it has slower rolloff.
  80      rate=0.5+rate/2;
  81      return 0.42-0.5*cos(2*PI*rate)+0.08*cos(4*PI*rate);
  82  }
  83  
  84  float firdes_wkernel_hamming(float rate)
  85  {
  86      //Explanation at Chapter 16 of dspguide.com, page 2
  87      //Hamming window has worse stopband attentuation and passband ripple than Blackman, but it has faster rolloff.
  88      rate=0.5+rate/2;
  89      return 0.54-0.46*cos(2*PI*rate);
  90  }
  91  
  92  
  93  float firdes_wkernel_boxcar(float rate)
  94  {   //"Dummy" window kernel, do not use; an unwindowed FIR filter may have bad frequency response
  95      return 1.0;
  96  }
  97  
  98  float (*firdes_get_window_kernel(window_t window))(float)
  99  {
 100      if(window==WINDOW_HAMMING) return firdes_wkernel_hamming;
 101      else if(window==WINDOW_BLACKMAN) return firdes_wkernel_blackman;
 102      else if(window==WINDOW_BOXCAR) return firdes_wkernel_boxcar;
 103      else return firdes_get_window_kernel(WINDOW_DEFAULT);
 104  }
 105  
 106  /*
 107    ______ _____ _____      __ _ _ _                   _           _
 108   |  ____|_   _|  __ \    / _(_) | |                 | |         (_)
 109   | |__    | | | |__) |  | |_ _| | |_ ___ _ __     __| | ___  ___ _  __ _ _ __
 110   |  __|   | | |  _  /   |  _| | | __/ _ \ '__|   / _` |/ _ \/ __| |/ _` | '_ \
 111   | |     _| |_| | \ \   | | | | | ||  __/ |     | (_| |  __/\__ \ | (_| | | | |
 112   |_|    |_____|_|  \_\  |_| |_|_|\__\___|_|      \__,_|\___||___/_|\__, |_| |_|
 113                                                                      __/ |
 114                                                                     |___/
 115  */
 116  
 117  void normalize_fir_f(float* input, float* output, int length)
 118  {
 119      //Normalize filter kernel
 120      float sum=0;
 121      for(int i=0;i<length;i++) //@normalize_fir_f: normalize pass 1
 122          sum+=input[i];
 123      for(int i=0;i<length;i++) //@normalize_fir_f: normalize pass 2
 124          output[i]=input[i]/sum;
 125  }
 126  
 127  void firdes_lowpass_f(float *output, int length, float cutoff_rate, window_t window)
 128  {   //Generates symmetric windowed sinc FIR filter real taps
 129      //  length should be odd
 130      //  cutoff_rate is (cutoff frequency/sampling frequency)
 131      //Explanation at Chapter 16 of dspguide.com
 132      int middle=length/2;
 133      float temp;
 134      float (*window_function)(float)  = firdes_get_window_kernel(window);
 135      output[middle]=2*PI*cutoff_rate*window_function(0);
 136      for(int i=1; i<=middle; i++) //@@firdes_lowpass_f: calculate taps
 137      {
 138          output[middle-i]=output[middle+i]=(sin(2*PI*cutoff_rate*i)/i)*window_function((float)i/middle);
 139          //printf("%g %d %d %d %d | %g\n",output[middle-i],i,middle,middle+i,middle-i,sin(2*PI*cutoff_rate*i));
 140      }
 141      normalize_fir_f(output,output,length);
 142  }
 143  
 144  void firdes_bandpass_c(complexf *output, int length, float lowcut, float highcut, window_t window)
 145  {
 146      //To generate a complex filter:
 147      //  1. we generate a real lowpass filter with a bandwidth of highcut-lowcut
 148      //  2. we shift the filter taps spectrally by multiplying with e^(j*w), so we get complex taps
 149      //(tnx HA5FT)
 150      float* realtaps = (float*)malloc(sizeof(float)*length);
 151  
 152      firdes_lowpass_f(realtaps, length, (highcut-lowcut)/2, window);
 153      float filter_center=(highcut+lowcut)/2;
 154  
 155      float phase=0, sinval, cosval;
 156      for(int i=0; i<length; i++) //@@firdes_bandpass_c
 157      {
 158          cosval=cos(phase);
 159          sinval=sin(phase);
 160          phase+=2*PI*filter_center;
 161          while(phase>2*PI) phase-=2*PI; //@@firdes_bandpass_c
 162          while(phase<0) phase+=2*PI;
 163          iof(output,i)=cosval*realtaps[i];
 164          qof(output,i)=sinval*realtaps[i];
 165          //output[i] := realtaps[i] * e^j*w
 166      }
 167  }
 168  
 169  int firdes_filter_len(float transition_bw)
 170  {
 171      int result=4.0/transition_bw;
 172      if (result%2==0) result++; //number of symmetric FIR filter taps should be odd
 173      return result;
 174  }
 175  
 176  /*
 177    _____   _____ _____      __                  _   _
 178   |  __ \ / ____|  __ \    / _|                | | (_)
 179   | |  | | (___ | |__) |  | |_ _   _ _ __   ___| |_ _  ___  _ __  ___
 180   | |  | |\___ \|  ___/   |  _| | | | '_ \ / __| __| |/ _ \| '_ \/ __|
 181   | |__| |____) | |       | | | |_| | | | | (__| |_| | (_) | | | \__ \
 182   |_____/|_____/|_|       |_|  \__,_|_| |_|\___|\__|_|\___/|_| |_|___/
 183  
 184  */
 185  
 186  float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase)
 187  {
 188      rate*=2;
 189      //Shifts the complex spectrum. Basically a complex mixer. This version uses cmath.
 190      float phase=starting_phase;
 191      float phase_increment=rate*PI;
 192      float cosval, sinval;
 193      for(int i=0;i<input_size; i++) //@shift_math_cc
 194      {
 195          cosval=cos(phase);
 196          sinval=sin(phase);
 197          //we multiply two complex numbers.
 198          //how? enter this to maxima (software) for explanation:
 199          //   (a+b*%i)*(c+d*%i), rectform;
 200          iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
 201          qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
 202          phase+=phase_increment;
 203          while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
 204          while(phase<0) phase+=2*PI;
 205      }
 206      return phase;
 207  }
 208  
 209  
 210  
 211  shift_table_data_t shift_table_init(int table_size)
 212  {
 213      //RTODO
 214      shift_table_data_t output;
 215      output.table=(float*)malloc(sizeof(float)*table_size);
 216      output.table_size=table_size;
 217      for(int i=0;i<table_size;i++)
 218      {
 219          output.table[i]=sin(((float)i/table_size)*(PI/2));
 220      }
 221      return output;
 222  }
 223  
 224  void shift_table_deinit(shift_table_data_t table_data)
 225  {
 226      free(table_data.table);
 227  }
 228  
 229  float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase)
 230  {
 231      //RTODO
 232      rate*=2;
 233      //Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table.
 234      float phase=starting_phase;
 235      float phase_increment=rate*PI;
 236      float cosval, sinval;
 237      for(int i=0;i<input_size; i++) //@shift_math_cc
 238      {
 239          int sin_index, cos_index, temp_index, sin_sign, cos_sign;
 240          //float vphase=fmodf(phase,PI/2); //between 0 and 90deg
 241          int quadrant=phase/(PI/2); //between 0 and 3
 242          float vphase=phase-quadrant*(PI/2);
 243          sin_index=(vphase/(PI/2))*table_data.table_size;
 244          cos_index=table_data.table_size-1-sin_index;
 245          if(quadrant&1) //in quadrant 1 and 3
 246          {
 247              temp_index=sin_index;
 248              sin_index=cos_index;
 249              cos_index=temp_index;
 250          }
 251          sin_sign=(quadrant>1)?-1:1; //in quadrant 2 and 3
 252          cos_sign=(quadrant&&quadrant<3)?-1:1; //in quadrant 1 and 2
 253          sinval=sin_sign*table_data.table[sin_index];
 254          cosval=cos_sign*table_data.table[cos_index];
 255          //we multiply two complex numbers.
 256          //how? enter this to maxima (software) for explanation:
 257          //   (a+b*%i)*(c+d*%i), rectform;
 258          iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
 259          qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
 260          phase+=phase_increment;
 261          while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
 262          while(phase<0) phase+=2*PI;
 263      }
 264      return phase;
 265  }
 266  
 267  
 268  shift_unroll_data_t shift_unroll_init(float rate, int size)
 269  {
 270      shift_unroll_data_t output;
 271      output.phase_increment=2*rate*PI;
 272      output.size = size;
 273      output.dsin=(float*)malloc(sizeof(float)*size);
 274      output.dcos=(float*)malloc(sizeof(float)*size);
 275      float myphase = 0;
 276      for(int i=0;i<size;i++)
 277      {
 278          myphase += output.phase_increment;
 279          while(myphase>PI) myphase-=2*PI;
 280          while(myphase<-PI) myphase+=2*PI;
 281          output.dsin[i]=sin(myphase);
 282          output.dcos[i]=cos(myphase);
 283      }
 284      return output;
 285  }
 286  
 287  float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase)
 288  {
 289      //input_size should be multiple of 4
 290      //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
 291      float cos_start=cos(starting_phase);
 292      float sin_start=sin(starting_phase);
 293      register float cos_val, sin_val;
 294      for(int i=0;i<input_size; i++) //@shift_unroll_cc
 295      {
 296          cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
 297          sin_val  = sin_start * d->dcos[i] + cos_start * d->dsin[i];
 298          iof(output,i)=cos_val*iof(input,i)-sin_val*qof(input,i);
 299          qof(output,i)=sin_val*iof(input,i)+cos_val*qof(input,i);
 300      }
 301      starting_phase+=input_size*d->phase_increment;
 302      while(starting_phase>PI) starting_phase-=2*PI;
 303      while(starting_phase<-PI) starting_phase+=2*PI;
 304      return starting_phase;
 305  }
 306  
 307  shift_addfast_data_t shift_addfast_init(float rate)
 308  {
 309      shift_addfast_data_t output;
 310      output.phase_increment=2*rate*PI;
 311      for(int i=0;i<4;i++)
 312      {
 313          output.dsin[i]=sin(output.phase_increment*(i+1));
 314          output.dcos[i]=cos(output.phase_increment*(i+1));
 315      }
 316      return output;
 317  }
 318  
 319  #ifdef NEON_OPTS
 320  #pragma message "Manual NEON optimizations are ON: we have a faster shift_addfast_cc now."
 321  
 322  float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
 323  {
 324      //input_size should be multiple of 4
 325      float cos_start[4], sin_start[4];
 326      float cos_vals[4], sin_vals[4];
 327      for(int i=0;i<4;i++)
 328      {
 329          cos_start[i] = cos(starting_phase);
 330          sin_start[i] = sin(starting_phase);
 331      }
 332  
 333      float* pdcos = d->dcos;
 334      float* pdsin = d->dsin;
 335      register float* pinput = (float*)input;
 336      register float* pinput_end = (float*)(input+input_size);
 337      register float* poutput = (float*)output;
 338  
 339      //Register map:
 340      #define RDCOS "q0" //dcos, dsin
 341      #define RDSIN "q1"
 342      #define RCOSST "q2" //cos_start, sin_start
 343      #define RSINST "q3"
 344      #define RCOSV "q4" //cos_vals, sin_vals
 345      #define RSINV "q5"
 346      #define ROUTI "q6" //output_i, output_q
 347      #define ROUTQ "q7"
 348      #define RINPI "q8" //input_i, input_q
 349      #define RINPQ "q9"
 350      #define R3(x,y,z) x ", " y ", " z "\n\t"
 351  
 352      asm volatile( //(the range of q is q0-q15)
 353          "       vld1.32 {" RDCOS "}, [%[pdcos]]\n\t"
 354          "       vld1.32 {" RDSIN "}, [%[pdsin]]\n\t"
 355          "       vld1.32 {" RCOSST "}, [%[cos_start]]\n\t"
 356          "       vld1.32 {" RSINST "}, [%[sin_start]]\n\t"
 357          "for_addfast: vld2.32 {" RINPI "-" RINPQ "}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in RINPI and the Q samples in RINPQ), also increment the memory address in pinput (hence the "!" mark)
 358  
 359          //C version:
 360          //cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j];
 361          //sin_vals[j] = sin_start * d->dcos[j] + cos_start * d->dsin[j];
 362  
 363          "       vmul.f32 " R3(RCOSV, RCOSST, RDCOS)  //cos_vals[i] = cos_start * d->dcos[i]
 364          "       vmls.f32 " R3(RCOSV, RSINST, RDSIN)  //cos_vals[i] -= sin_start * d->dsin[i]
 365          "       vmul.f32 " R3(RSINV, RSINST, RDCOS)  //sin_vals[i] = sin_start * d->dcos[i]
 366          "       vmla.f32 " R3(RSINV, RCOSST, RDSIN)  //sin_vals[i] += cos_start * d->dsin[i]
 367  
 368          //C version:
 369          //iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j);
 370          //qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j);
 371          "       vmul.f32 " R3(ROUTI, RCOSV, RINPI) //output_i =  cos_vals * input_i
 372          "       vmls.f32 " R3(ROUTI, RSINV, RINPQ) //output_i -= sin_vals * input_q
 373          "       vmul.f32 " R3(ROUTQ, RSINV, RINPI) //output_q =  sin_vals * input_i
 374          "       vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //output_i += cos_vals * input_q
 375  
 376          "       vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]!\n\t" //store the outputs in memory
 377          //"     add %[poutput],%[poutput],#32\n\t"
 378          "       vdup.32 " RCOSST ", d9[1]\n\t" // cos_start[0-3] = cos_vals[3]
 379          "       vdup.32 " RSINST ", d11[1]\n\t" // sin_start[0-3] = sin_vals[3]
 380  
 381          "       cmp %[pinput], %[pinput_end]\n\t" //if(pinput != pinput_end)
 382          "       bcc for_addfast\n\t"              //    then goto for_addfast
 383      :
 384          [pinput]"+r"(pinput), [poutput]"+r"(poutput) //output operand list -> C variables that we will change from ASM
 385      :
 386          [pinput_end]"r"(pinput_end), [pdcos]"r"(pdcos), [pdsin]"r"(pdsin), [sin_start]"r"(sin_start), [cos_start]"r"(cos_start) //input operand list
 387      :
 388          "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", "cc" //clobber list
 389      );
 390      starting_phase+=input_size*d->phase_increment;
 391      while(starting_phase>PI) starting_phase-=2*PI;
 392      while(starting_phase<-PI) starting_phase+=2*PI;
 393      return starting_phase;
 394  }
 395  
 396  #else
 397  
 398  
 399  #if 1
 400  
 401  #define SADF_L1(j) cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \
 402      sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j;
 403  #define SADF_L2(j) iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \
 404      qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j);
 405  
 406  float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
 407  {
 408      //input_size should be multiple of 4
 409      //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
 410      float cos_start=cos(starting_phase);
 411      float sin_start=sin(starting_phase);
 412      float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
 413          sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
 414          dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
 415          dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
 416  
 417      for(int i=0;i<input_size/4; i++) //@shift_addfast_cc
 418      {
 419          SADF_L1(0)
 420          SADF_L1(1)
 421          SADF_L1(2)
 422          SADF_L1(3)
 423          SADF_L2(0)
 424          SADF_L2(1)
 425          SADF_L2(2)
 426          SADF_L2(3)
 427          cos_start = cos_vals_3;
 428          sin_start = sin_vals_3;
 429      }
 430      starting_phase+=input_size*d->phase_increment;
 431      while(starting_phase>PI) starting_phase-=2*PI;
 432      while(starting_phase<-PI) starting_phase+=2*PI;
 433      return starting_phase;
 434  }
 435  #else
 436  float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
 437  {
 438      //input_size should be multiple of 4
 439      //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
 440      float cos_start=cos(starting_phase);
 441      float sin_start=sin(starting_phase);
 442      float cos_vals[4], sin_vals[4];
 443      for(int i=0;i<input_size/4; i++) //@shift_addfast_cc
 444      {
 445          for(int j=0;j<4;j++) //@shift_addfast_cc
 446          {
 447              cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j];
 448              sin_vals[j] = sin_start * d->dcos[j] + cos_start * d->dsin[j];
 449          }
 450          for(int j=0;j<4;j++) //@shift_addfast_cc
 451          {
 452              iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j);
 453              qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j);
 454          }
 455          cos_start = cos_vals[3];
 456          sin_start = sin_vals[3];
 457      }
 458      starting_phase+=input_size*d->phase_increment;
 459      while(starting_phase>PI) starting_phase-=2*PI;
 460      while(starting_phase<-PI) starting_phase+=2*PI;
 461      return starting_phase;
 462  }
 463  #endif
 464  
 465  #endif
 466  
 467  #ifdef NEON_OPTS
 468  #pragma message "Manual NEON optimizations are ON: we have a faster fir_decimate_cc now."
 469  
 470  //max help: http://community.arm.com/groups/android-community/blog/2015/03/27/arm-neon-programming-quick-reference
 471  
 472  int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
 473  {
 474      //Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
 475      //It uses real taps. It returns the number of output samples actually written.
 476      //It needs overlapping input based on its returned value:
 477      //number of processed input samples = returned value * decimation factor
 478      //The output buffer should be at least input_length / 3.
 479      // i: input index | ti: tap index | oi: output index
 480      int oi=0;
 481      for(int i=0; i<input_size; i+=decimation) //@fir_decimate_cc: outer loop
 482      {
 483          if(i+taps_length>input_size) break;
 484          register float* pinput=(float*)&(input[i]);
 485          register float* ptaps=taps;
 486          register float* ptaps_end=taps+taps_length;
 487          float quad_acciq [8];
 488  
 489  
 490  /*
 491  q0, q1: input signal I sample and Q sample
 492  q2:     taps
 493  q4, q5: accumulator for I branch and Q branch (will be the output)
 494  */
 495  
 496          asm volatile(
 497              "       veor q4, q4\n\t"
 498              "       veor q5, q5\n\t"
 499              "for_fdccasm: vld2.32   {q0-q1}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in q0 and the Q samples in q1), also increment the memory address in pinput (hence the "!" mark) //http://community.arm.com/groups/processors/blog/2010/03/17/coding-for-neon--part-1-load-and-stores
 500              "       vld1.32 {q2}, [%[ptaps]]!\n\t"
 501              "       vmla.f32 q4, q0, q2\n\t" //quad_acc_i += quad_input_i * quad_taps_1 //http://stackoverflow.com/questions/3240440/how-to-use-the-multiply-and-accumulate-intrinsics-in-arm-cortex-a8 //http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0489e/CIHEJBIE.html
 502              "       vmla.f32 q5, q1, q2\n\t" //quad_acc_q += quad_input_q * quad_taps_1
 503              "       cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps != ptaps_end)
 504              "       bcc for_fdccasm\n\t"            //  then goto for_fdcasm
 505              "       vst1.32 {q4}, [%[quad_acci]]\n\t" //if the loop is finished, store the two accumulators in memory
 506              "       vst1.32 {q5}, [%[quad_accq]]\n\t"
 507          :
 508              [pinput]"+r"(pinput), [ptaps]"+r"(ptaps) //output operand list
 509          :
 510              [ptaps_end]"r"(ptaps_end), [quad_acci]"r"(quad_acciq), [quad_accq]"r"(quad_acciq+4) //input operand list
 511          :
 512              "memory", "q0", "q1", "q2", "q4", "q5", "cc" //clobber list
 513          );
 514          //original for loops for reference:
 515          //for(int ti=0; ti<taps_length; ti++) acci += (iof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: i loop
 516          //for(int ti=0; ti<taps_length; ti++) accq += (qof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: q loop
 517  
 518          //for(int n=0;n<8;n++) fprintf(stderr, "\n>> [%d] %g \n", n, quad_acciq[n]);
 519          iof(output,oi)=quad_acciq[0]+quad_acciq[1]+quad_acciq[2]+quad_acciq[3]; //we're still not ready, as we have to add up the contents of a quad accumulator register to get a single accumulated value
 520          qof(output,oi)=quad_acciq[4]+quad_acciq[5]+quad_acciq[6]+quad_acciq[7];
 521          oi++;
 522      }
 523      return oi;
 524  }
 525  
 526  #else
 527  
 528  int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
 529  {
 530      //Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
 531      //It uses real taps. It returns the number of output samples actually written.
 532      //It needs overlapping input based on its returned value:
 533      //number of processed input samples = returned value * decimation factor
 534      //The output buffer should be at least input_length / 3.
 535      // i: input index | ti: tap index | oi: output index
 536      int oi=0;
 537      for(int i=0; i<input_size; i+=decimation) //@fir_decimate_cc: outer loop
 538      {
 539          if(i+taps_length>input_size) break;
 540          float acci=0;
 541          for(int ti=0; ti<taps_length; ti++) acci += (iof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: i loop
 542          float accq=0;
 543          for(int ti=0; ti<taps_length; ti++) accq += (qof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: q loop
 544          iof(output,oi)=acci;
 545          qof(output,oi)=accq;
 546          oi++;
 547      }
 548      return oi;
 549  }
 550  
 551  #endif
 552  
 553  /*
 554  int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
 555  {
 556      //Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
 557      //It uses real taps. It returns the number of output samples actually written.
 558      //It needs overlapping input based on its returned value:
 559      //number of processed input samples = returned value * decimation factor
 560      //The output buffer should be at least input_length / 3.
 561      // i: input index | ti: tap index | oi: output index
 562      int oi=0;
 563      for(int i=0; i<input_size; i+=decimation) //@fir_decimate_cc: outer loop
 564      {
 565          if(i+taps_length>input_size) break;
 566          float acci=0;
 567          int taps_halflength = taps_length/2;
 568          for(int ti=0; ti<taps_halflength; ti++) acci += (iof(input,i+ti)+iof(input,i+taps_length-ti)) * taps[ti]; //@fir_decimate_cc: i loop
 569          float accq=0;
 570          for(int ti=0; ti<taps_halflength; ti++) accq += (qof(input,i+ti)+qof(input,i+taps_length-ti)) * taps[ti]; //@fir_decimate_cc: q loop
 571          iof(output,oi)=acci+iof(input,i+taps_halflength)*taps[taps_halflength];
 572          qof(output,oi)=accq+qof(input,i+taps_halflength)*taps[taps_halflength];
 573          oi++;
 574      }
 575      return oi;
 576  }
 577  */
 578  
 579  int fir_interpolate_cc(complexf *input, complexf *output, int input_size, int interpolation, float *taps, int taps_length)
 580  {
 581      //i:  input index
 582      //oi: output index
 583      //ti: tap index
 584      //ti: secondary index (inside filter function)
 585      //ip: interpolation phase (0 <= ip < interpolation)
 586      int oi=0;
 587      for(int i=0; i<input_size; i++) //@fir_interpolate_cc: outer loop
 588      {
 589          if(i*interpolation + (interpolation-1) + taps_length > input_size*interpolation) break;
 590          for(int ip=0; ip<interpolation; ip++)
 591          {
 592              float acci=0;
 593              float accq=0;
 594              //int tistart = (interpolation-ip)%interpolation; 
 595              int tistart = (interpolation-ip); //why does this work? why don't we need the % part?
 596              for(int ti=tistart, si=0; ti<taps_length; (ti+=interpolation), (si++)) acci += (iof(input,i+si)) * taps[ti]; //@fir_interpolate_cc: i loop
 597              for(int ti=tistart, si=0; ti<taps_length; (ti+=interpolation), (si++)) accq += (qof(input,i+si)) * taps[ti]; //@fir_interpolate_cc: q loop
 598              iof(output,oi)=acci;
 599              qof(output,oi)=accq;
 600              oi++;
 601          }
 602      }
 603      return oi;
 604  }
 605  
 606  
 607  rational_resampler_ff_t rational_resampler_ff(float *input, float *output, int input_size, int interpolation, int decimation, float *taps, int taps_length, int last_taps_delay)
 608  {
 609  
 610      //Theory: http://www.dspguru.com/dsp/faqs/multirate/resampling
 611      //oi: output index, i: tap index
 612      int output_size=input_size*interpolation/decimation;
 613      int oi;
 614      int startingi, delayi;
 615      //fprintf(stderr,"rational_resampler_ff | interpolation = %d | decimation = %d\ntaps_length = %d | input_size = %d | output_size = %d | last_taps_delay = %d\n",interpolation,decimation,taps_length,input_size,output_size,last_taps_delay);
 616      for (oi=0; oi<output_size; oi++) //@rational_resampler_ff (outer loop)
 617      {
 618          float acc=0;
 619          startingi=(oi*decimation+interpolation-1-last_taps_delay)/interpolation; //index of first input item to apply FIR on
 620          //delayi=startingi*interpolation-oi*decimation; //delay on FIR taps
 621          delayi=(last_taps_delay+startingi*interpolation-oi*decimation)%interpolation; //delay on FIR taps
 622          if(startingi+taps_length/interpolation+1>input_size) break; //we can't compute the FIR filter to some input samples at the end
 623          //fprintf(stderr,"outer loop | oi = %d | startingi = %d | taps delay = %d\n",oi,startingi,delayi);
 624          for(int i=0; i<(taps_length-delayi)/interpolation; i++) //@rational_resampler_ff (inner loop)
 625          {
 626              //fprintf(stderr,"inner loop | input index = %d | tap index = %d | acc = %g\n",startingi+ii,i,acc);
 627              acc+=input[startingi+i]*taps[delayi+i*interpolation];
 628          }
 629          output[oi]=acc*interpolation;
 630      }
 631      rational_resampler_ff_t d;
 632      d.input_processed=startingi;
 633      d.output_size=oi;
 634      d.last_taps_delay=delayi;
 635      return d;
 636  }
 637  
 638  /*
 639  
 640  The greatest challenge in resampling is figuring out which tap should be applied to which sample.
 641  
 642  Typical test patterns for rational_resampler_ff:
 643  
 644  interpolation = 3, decimation = 1
 645  values of [oi, startingi, taps delay] in the outer loop should be:
 646  0 0 0
 647  1 1 2
 648  2 1 1
 649  3 1 0
 650  4 2 2
 651  5 2 1
 652  
 653  interpolation = 3, decimation = 2
 654  values of [oi, startingi, taps delay] in the outer loop should be:
 655  0 0 0
 656  1 1 1
 657  2 2 2
 658  3 2 0
 659  4 3 1
 660  5 4 2
 661  
 662  */
 663  
 664  
 665  void rational_resampler_get_lowpass_f(float* output, int output_size, int interpolation, int decimation, window_t window)
 666  {
 667  
 668      //See 4.1.6 at: http://www.dspguru.com/dsp/faqs/multirate/resampling
 669      float cutoff_for_interpolation=1.0/interpolation;
 670      float cutoff_for_decimation=1.0/decimation;
 671      float cutoff = (cutoff_for_interpolation<cutoff_for_decimation)?cutoff_for_interpolation:cutoff_for_decimation; //get the lower
 672      firdes_lowpass_f(output, output_size, cutoff/2, window);
 673  }
 674  
 675  float inline fir_one_pass_ff(float* input, float* taps, int taps_length)
 676  {
 677      float acc=0;
 678      for(int i=0;i<taps_length;i++) acc+=taps[i]*input[i]; //@fir_one_pass_ff
 679      return acc;
 680  }
 681  
 682  old_fractional_decimator_ff_t old_fractional_decimator_ff(float* input, float* output, int input_size, float rate, float *taps, int taps_length, old_fractional_decimator_ff_t d)
 683  {
 684      if(rate<=1.0) return d; //sanity check, can't decimate <=1.0
 685      //This routine can handle floating point decimation rates.
 686      //It linearly interpolates between two samples that are taken into consideration from the filtered input.
 687      int oi=0;
 688      int index_high;
 689      float where=d.remain;
 690      float result_high, result_low;
 691      if(where==0.0) //in the first iteration index_high may be zero (so using the item index_high-1 would lead to invalid memory access).
 692      {
 693          output[oi++]=fir_one_pass_ff(input,taps,taps_length);
 694          where+=rate;
 695      }
 696  
 697      int previous_index_high=-1;
 698      //we optimize to calculate ceilf(where) only once every iteration, so we do it here:
 699      for(;(index_high=ceilf(where))+taps_length<input_size;where+=rate) //@fractional_decimator_ff
 700      {
 701          if(previous_index_high==index_high-1) result_low=result_high; //if we step less than 2.0 then we do already have the result for the FIR filter for that index
 702          else result_low=fir_one_pass_ff(input+index_high-1,taps,taps_length);
 703          result_high=fir_one_pass_ff(input+index_high,taps,taps_length);
 704          float register rate_between_samples=where-index_high+1;
 705          output[oi++]=result_low*(1-rate_between_samples)+result_high*rate_between_samples;
 706          previous_index_high=index_high;
 707      }
 708  
 709      d.input_processed=index_high-1;
 710      d.remain=where-d.input_processed;
 711      d.output_size=oi;
 712      return d;
 713  }
 714  
 715  fractional_decimator_ff_t fractional_decimator_ff_init(float rate, int num_poly_points, float* taps, int taps_length)
 716  {
 717      fractional_decimator_ff_t d;
 718      d.num_poly_points = num_poly_points&~1; //num_poly_points needs to be even!
 719      d.poly_precalc_denomiator = (float*)malloc(d.num_poly_points*sizeof(float));
 720      //x0..x3
 721      //-1,0,1,2
 722      //-(4/2)+1
 723      //x0..x5
 724      //-2,-1,0,1,2,3
 725      d.xifirst=-(num_poly_points/2)+1, d.xilast=num_poly_points/2;
 726      int id = 0; //index in poly_precalc_denomiator
 727      for(int xi=d.xifirst;xi<=d.xilast;xi++)
 728      {
 729          d.poly_precalc_denomiator[id]=1;
 730          for(int xj=d.xifirst;xj<=d.xilast;xj++)
 731          {
 732              if(xi!=xj) d.poly_precalc_denomiator[id] *= (xi-xj); //poly_precalc_denomiator could be integer as well. But that would later add a necessary conversion.
 733          }
 734          id++;
 735      }
 736      d.where=-d.xifirst;
 737      d.coeffs_buf=(float*)malloc(d.num_poly_points*sizeof(float)); 
 738      d.filtered_buf=(float*)malloc(d.num_poly_points*sizeof(float)); 
 739      //d.last_inputs_circbuf = (float)malloc(d.num_poly_points*sizeof(float));
 740      //d.last_inputs_startsat = 0; 
 741      //d.last_inputs_samplewhere = -1;
 742      //for(int i=0;i<num_poly_points; i++) d.last_inputs_circbuf[i] = 0;
 743      d.rate = rate;
 744      d.taps = taps;
 745      d.taps_length = taps_length;
 746      d.input_processed = 0;
 747      return d;
 748  }
 749  
 750  #define DEBUG_ASSERT 1
 751  void fractional_decimator_ff(float* input, float* output, int input_size, fractional_decimator_ff_t* d)
 752  {
 753      //This routine can handle floating point decimation rates.
 754      //It applies polynomial interpolation to samples that are taken into consideration from a pre-filtered input.
 755      //The pre-filter can be switched off by applying taps=NULL.
 756      //fprintf(stderr, "drate=%f\n", d->rate);
 757      if(DEBUG_ASSERT) assert(d->rate > 1.0); 
 758      if(DEBUG_ASSERT) assert(d->where >= -d->xifirst);
 759      int oi=0; //output index
 760      int index_high; 
 761  #define FD_INDEX_LOW (index_high-1)
 762      //we optimize to calculate ceilf(where) only once every iteration, so we do it here:
 763      for(;(index_high=ceilf(d->where))+d->num_poly_points+d->taps_length<input_size;d->where+=d->rate) //@fractional_decimator_ff
 764      {
 765          //d->num_poly_points above is theoretically more than we could have here, but this makes the spectrum look good
 766          int sxifirst = FD_INDEX_LOW + d->xifirst; 
 767          int sxilast = FD_INDEX_LOW + d->xilast; 
 768          if(d->taps) 
 769              for(int wi=0;wi<d->num_poly_points;wi++) d->filtered_buf[wi] = fir_one_pass_ff(input+FD_INDEX_LOW+wi, d->taps, d->taps_length);
 770          else
 771              for(int wi=0;wi<d->num_poly_points;wi++) d->filtered_buf[wi] = *(input+FD_INDEX_LOW+wi);
 772          int id=0;
 773          float xwhere = d->where - FD_INDEX_LOW;
 774          for(int xi=d->xifirst;xi<=d->xilast;xi++)
 775          {
 776              d->coeffs_buf[id]=1;
 777              for(int xj=d->xifirst;xj<=d->xilast;xj++)
 778              {
 779                  if(xi!=xj) d->coeffs_buf[id] *= (xwhere-xj);
 780              }
 781              id++;       
 782          }
 783          float acc = 0;
 784          for(int i=0;i<d->num_poly_points;i++)
 785          {
 786              acc += (d->coeffs_buf[i]/d->poly_precalc_denomiator[i])*d->filtered_buf[i];  //(xnom/xden)*yn
 787          }
 788          output[oi++]=acc;
 789      }
 790      d->input_processed = FD_INDEX_LOW + d->xifirst;
 791      d->where -= d->input_processed;
 792      d->output_size = oi;
 793  }
 794  
 795  /*
 796   * Some notes to myself on the circular buffer I wanted to implement here:
 797          int last_input_samplewhere_shouldbe = (index_high-1)+xifirst;
 798          int last_input_offset = last_input_samplewhere_shouldbe - d->last_input_samplewhere;
 799          if(last_input_offset < num_poly_points)
 800          {
 801              //if we can move the last_input circular buffer, we move, and add the new samples at the end
 802              d->last_inputs_startsat += last_input_offset;
 803              d->last_inputs_startsat %= num_poly_points;
 804              int num_copied_samples = 0;
 805              for(int i=0; i<last_input_offset; i++)
 806              {
 807                  d->last_inputs_circbuf[i]=
 808              }
 809              d->last_input_samplewhere = d->las
 810          }
 811      However, I think I should just rather do a continuous big buffer.
 812  */
 813  
 814  void apply_fir_fft_cc(FFT_PLAN_T* plan, FFT_PLAN_T* plan_inverse, complexf* taps_fft, complexf* last_overlap, int overlap_size)
 815  {
 816      //use the overlap & add method for filtering
 817  
 818      //calculate FFT on input buffer
 819      fft_execute(plan);
 820  
 821      //multiply the filter and the input
 822      complexf* in = plan->output;
 823      complexf* out = plan_inverse->input;
 824  
 825      for(int i=0;i<plan->size;i++) //@apply_fir_fft_cc: multiplication
 826      {
 827          iof(out,i)=iof(in,i)*iof(taps_fft,i)-qof(in,i)*qof(taps_fft,i);
 828          qof(out,i)=iof(in,i)*qof(taps_fft,i)+qof(in,i)*iof(taps_fft,i);
 829      }
 830  
 831      //calculate inverse FFT on multiplied buffer
 832      fft_execute(plan_inverse);
 833  
 834      //add the overlap of the previous segment
 835      complexf* result = plan_inverse->output;
 836  
 837      for(int i=0;i<plan->size;i++) //@apply_fir_fft_cc: normalize by fft_size
 838      {
 839          iof(result,i)/=plan->size;
 840          qof(result,i)/=plan->size;
 841      }
 842  
 843      for(int i=0;i<overlap_size;i++) //@apply_fir_fft_cc: add overlap
 844      {
 845          iof(result,i)=iof(result,i)+iof(last_overlap,i);
 846          qof(result,i)=qof(result,i)+qof(last_overlap,i);
 847      }
 848  
 849  }
 850  
 851  /*
 852             __  __       _                          _       _       _
 853       /\   |  \/  |     | |                        | |     | |     | |
 854      /  \  | \  / |   __| | ___ _ __ ___   ___   __| |_   _| | __ _| |_ ___  _ __ ___
 855     / /\ \ | |\/| |  / _` |/ _ \ '_ ` _ \ / _ \ / _` | | | | |/ _` | __/ _ \| '__/ __|
 856    / ____ \| |  | | | (_| |  __/ | | | | | (_) | (_| | |_| | | (_| | || (_) | |  \__ \
 857   /_/    \_\_|  |_|  \__,_|\___|_| |_| |_|\___/ \__,_|\__,_|_|\__,_|\__\___/|_|  |___/
 858  
 859  */
 860  
 861  void amdemod_cf(complexf* input, float *output, int input_size)
 862  {
 863      //@amdemod: i*i+q*q
 864      for (int i=0; i<input_size; i++)
 865      {
 866          output[i]=iof(input,i)*iof(input,i)+qof(input,i)*qof(input,i);
 867      }
 868      //@amdemod: sqrt
 869      for (int i=0; i<input_size; i++)
 870      {
 871          output[i]=sqrt(output[i]);
 872      }
 873  }
 874  
 875  void amdemod_estimator_cf(complexf* input, float *output, int input_size, float alpha, float beta)
 876  {
 877      //concept is explained here:
 878      //http://www.dspguru.com/dsp/tricks/magnitude-estimator
 879  
 880      //default: optimize for min RMS error
 881      if(alpha==0)
 882      {
 883          alpha=0.947543636291;
 884          beta=0.392485425092;
 885      }
 886  
 887      //@amdemod_estimator
 888      for (int i=0; i<input_size; i++)
 889      {
 890          float abs_i=iof(input,i);
 891          if(abs_i<0) abs_i=-abs_i;
 892          float abs_q=qof(input,i);
 893          if(abs_q<0) abs_q=-abs_q;
 894          float max_iq=abs_i;
 895          if(abs_q>max_iq) max_iq=abs_q;
 896          float min_iq=abs_i;
 897          if(abs_q<min_iq) min_iq=abs_q;
 898  
 899          output[i]=alpha*max_iq+beta*min_iq;
 900      }
 901  }
 902  
 903  dcblock_preserve_t dcblock_ff(float* input, float* output, int input_size, float a, dcblock_preserve_t preserved)
 904  {
 905      //after AM demodulation, a DC blocking filter should be used to remove the DC component from the signal.
 906      //Concept: http://peabody.sapp.org/class/dmp2/lab/dcblock/
 907      //output size equals to input_size;
 908      //preserve can be initialized to zero on first run.
 909      if(a==0) a=0.999; //default value, simulate in octave: freqz([1 -1],[1 -0.99])
 910      output[0]=input[0]-preserved.last_input+a*preserved.last_output;
 911      for(int i=1; i<input_size; i++) //@dcblock_f
 912      {
 913          output[i]=input[i]-input[i-1]+a*output[i-1];
 914      }
 915      preserved.last_input=input[input_size-1];
 916      preserved.last_output=output[input_size-1];
 917      return preserved;
 918  }
 919  
 920  float fastdcblock_ff(float* input, float* output, int input_size, float last_dc_level)
 921  {
 922      //this DC block filter does moving average block-by-block.
 923      //this is the most computationally efficient
 924      //input and output buffer is allowed to be the same
 925      //http://www.digitalsignallabs.com/dcblock.pdf
 926      float avg=0.0;
 927      for(int i=0;i<input_size;i++) //@fastdcblock_ff: calculate block average
 928      {
 929          avg+=input[i];
 930      }
 931      avg/=input_size;
 932  
 933      float avgdiff=avg-last_dc_level;
 934      //DC removal level will change lineraly from last_dc_level to avg.
 935      for(int i=0;i<input_size;i++) //@fastdcblock_ff: remove DC component
 936      {
 937          float dc_removal_level=last_dc_level+avgdiff*((float)i/input_size);
 938          output[i]=input[i]-dc_removal_level;
 939      }
 940      return avg;
 941  }
 942  
 943  //#define FASTAGC_MAX_GAIN (65e3)
 944  #define FASTAGC_MAX_GAIN 50
 945  
 946  void fastagc_ff(fastagc_ff_t* input, float* output)
 947  {
 948      //Gain is processed on blocks of samples.
 949      //You have to supply three blocks of samples before the first block comes out.
 950      //AGC reaction speed equals input_size*samp_rate*2
 951  
 952      //The algorithm calculates target gain at the end of the first block out of the peak value of all the three blocks.
 953      //This way the gain change can easily react if there is any peak in the third block.
 954      //Pros: can be easily speeded up with loop vectorization, easy to implement
 955      //Cons: needs 3 buffers, dos not behave similarly to real AGC circuits
 956  
 957      //Get the peak value of new input buffer
 958      float peak_input=0;
 959      for(int i=0;i<input->input_size;i++) //@fastagc_ff: peak search
 960      {
 961          float val=fabs(input->buffer_input[i]);
 962          if(val>peak_input) peak_input=val;
 963      }
 964  
 965      //Determine the maximal peak out of the three blocks
 966      float target_peak=peak_input;
 967      if(target_peak<input->peak_2) target_peak=input->peak_2;
 968      if(target_peak<input->peak_1) target_peak=input->peak_1;
 969  
 970      //we change the gain linearly on the apply_block from the last_gain to target_gain.
 971      float target_gain=input->reference/target_peak;
 972      if(target_gain>FASTAGC_MAX_GAIN) target_gain=FASTAGC_MAX_GAIN;
 973      //fprintf(stderr, "target_gain: %g\n",target_gain);
 974  
 975      for(int i=0;i<input->input_size;i++) //@fastagc_ff: apply gain
 976      {
 977          float rate=(float)i/input->input_size;
 978          float gain=input->last_gain*(1.0-rate)+target_gain*rate;
 979          output[i]=input->buffer_1[i]*gain;
 980      }
 981  
 982      //Shift the three buffers
 983      float* temp_pointer=input->buffer_1;
 984      input->buffer_1=input->buffer_2;
 985      input->peak_1=input->peak_2;
 986      input->buffer_2=input->buffer_input;
 987      input->peak_2=peak_input;
 988      input->buffer_input=temp_pointer;
 989      input->last_gain=target_gain;
 990      //fprintf(stderr,"target_gain=%g\n", target_gain);
 991  }
 992  
 993  /*
 994    ______ __  __        _                          _       _       _
 995   |  ____|  \/  |      | |                        | |     | |     | |
 996   | |__  | \  / |    __| | ___ _ __ ___   ___   __| |_   _| | __ _| |_ ___  _ __ ___
 997   |  __| | |\/| |   / _` |/ _ \ '_ ` _ \ / _ \ / _` | | | | |/ _` | __/ _ \| '__/ __|
 998   | |    | |  | |  | (_| |  __/ | | | | | (_) | (_| | |_| | | (_| | || (_) | |  \__ \
 999   |_|    |_|  |_|   \__,_|\___|_| |_| |_|\___/ \__,_|\__,_|_|\__,_|\__\___/|_|  |___/
1000  
1001  */
1002  
1003  
1004  float fmdemod_atan_cf(complexf* input, float *output, int input_size, float last_phase)
1005  {
1006      //GCC most likely won't vectorize nor atan, nor atan2.
1007      //For more comments, look at: https://github.com/simonyiszk/minidemod/blob/master/minidemod-wfm-atan.c
1008      float phase, dphase;
1009      for (int i=0; i<input_size; i++) //@fmdemod_atan_novect
1010      {
1011          phase=argof(input,i);
1012          dphase=phase-last_phase;
1013          if(dphase<-PI) dphase+=2*PI;
1014          if(dphase>PI) dphase-=2*PI;
1015          output[i]=dphase/PI;
1016          last_phase=phase;
1017      }
1018      return last_phase;
1019  }
1020  
1021  #define fmdemod_quadri_K 0.340447550238101026565118445432744920253753662109375
1022  //this constant ensures proper scaling for qa_fmemod testcases for SNR calculation and more.
1023  
1024  complexf fmdemod_quadri_novect_cf(complexf* input, float* output, int input_size, complexf last_sample)
1025  {
1026      output[0]=fmdemod_quadri_K*(iof(input,0)*(qof(input,0)-last_sample.q)-qof(input,0)*(iof(input,0)-last_sample.i))/(iof(input,0)*iof(input,0)+qof(input,0)*qof(input,0));
1027      for (int i=1; i<input_size; i++) //@fmdemod_quadri_novect_cf
1028      {
1029          float qnow=qof(input,i);
1030          float qlast=qof(input,i-1);
1031          float inow=iof(input,i);
1032          float ilast=iof(input,i-1);
1033          output[i]=fmdemod_quadri_K*(inow*(qnow-qlast)-qnow*(inow-ilast))/(inow*inow+qnow*qnow);
1034          //TODO: expression can be simplified as: (qnow*ilast-inow*qlast)/(inow*inow+qnow*qnow)
1035      }
1036      return input[input_size-1];
1037  }
1038  
1039  
1040  complexf fmdemod_quadri_cf(complexf* input, float* output, int input_size, float *temp, complexf last_sample)
1041  {
1042      float* temp_dq=temp;
1043      float* temp_di=temp+input_size;
1044  
1045      temp_dq[0]=qof(input,0)-last_sample.q;
1046      for (int i=1; i<input_size; i++) //@fmdemod_quadri_cf: dq
1047      {
1048          temp_dq[i]=qof(input,i)-qof(input,i-1);
1049      }
1050  
1051      temp_di[0]=iof(input,0)-last_sample.i;
1052      for (int i=1; i<input_size; i++) //@fmdemod_quadri_cf: di
1053      {
1054          temp_di[i]=iof(input,i)-iof(input,i-1);
1055      }
1056  
1057      for (int i=0; i<input_size; i++) //@fmdemod_quadri_cf: output numerator
1058      {
1059          output[i]=(iof(input,i)*temp_dq[i]-qof(input,i)*temp_di[i]);
1060      }
1061      for (int i=0; i<input_size; i++) //@fmdemod_quadri_cf: output denomiator
1062      {
1063          temp[i]=iof(input,i)*iof(input,i)+qof(input,i)*qof(input,i);
1064      }
1065      for (int i=0; i<input_size; i++) //@fmdemod_quadri_cf: output division
1066      {
1067          output[i]=(temp[i])?fmdemod_quadri_K*output[i]/temp[i]:0;
1068      }
1069  
1070      return input[input_size-1];
1071  }
1072  
1073  inline int is_nan(float f)
1074  {
1075      //http://stackoverflow.com/questions/570669/checking-if-a-double-or-float-is-nan-in-c
1076      unsigned u = *(unsigned*)&f;
1077      return (u&0x7F800000) == 0x7F800000 && (u&0x7FFFFF); // Both NaN and qNan.
1078  }
1079  
1080  
1081  float deemphasis_wfm_ff (float* input, float* output, int input_size, float tau, int sample_rate, float last_output)
1082  {
1083      /*
1084          typical time constant (tau) values:
1085          WFM transmission in USA: 75 us -> tau = 75e-6
1086          WFM transmission in EU:  50 us -> tau = 50e-6
1087          More info at: http://www.cliftonlaboratories.com/fm_receivers_and_de-emphasis.htm
1088          Simulate in octave: tau=75e-6; dt=1/48000; alpha = dt/(tau+dt); freqz([alpha],[1 -(1-alpha)])
1089      */
1090      float dt = 1.0/sample_rate;
1091      float alpha = dt/(tau+dt);
1092      if(is_nan(last_output)) last_output=0.0; //if last_output is NaN
1093      output[0]=alpha*input[0]+(1-alpha)*last_output;
1094      for (int i=1;i<input_size;i++) //@deemphasis_wfm_ff
1095         output[i]=alpha*input[i]+(1-alpha)*output[i-1]; //this is the simplest IIR LPF
1096      return output[input_size-1];
1097  }
1098  
1099  #define DNFMFF_ADD_ARRAY(x) if(sample_rate==x) { taps=deemphasis_nfm_predefined_fir_##x; taps_length=sizeof(deemphasis_nfm_predefined_fir_##x)/sizeof(float); }
1100  
1101  int deemphasis_nfm_ff (float* input, float* output, int input_size, int sample_rate)
1102  {
1103      /*
1104          Warning! This only works on predefined samplerates, as it uses fixed FIR coefficients defined in predefined.h
1105          However, there are the octave commands to generate the taps for your custom (fixed) sample rate.
1106          What it does:
1107              - reject below 400 Hz
1108              - passband between 400 HZ - 4 kHz, but with 20 dB/decade rolloff (for deemphasis)
1109              - reject everything above 4 kHz
1110      */
1111      float* taps;
1112      int taps_length=0;
1113  
1114      DNFMFF_ADD_ARRAY(48000)
1115      DNFMFF_ADD_ARRAY(44100)
1116      DNFMFF_ADD_ARRAY(8000)
1117      DNFMFF_ADD_ARRAY(11025)
1118  
1119      if(!taps_length) return 0; //sample rate n
1120      int i;
1121      for(i=0;i<input_size-taps_length;i++) //@deemphasis_nfm_ff: outer loop
1122      {
1123          float acc=0;
1124          for(int ti=0;ti<taps_length;ti++) acc+=taps[ti]*input[i+ti]; //@deemphasis_nfm_ff: inner loop
1125          output[i]=acc;
1126      }
1127      return i; //number of samples processed (and output samples)
1128  }
1129  
1130  void limit_ff(float* input, float* output, int input_size, float max_amplitude)
1131  {
1132      for (int i=0; i<input_size; i++) //@limit_ff
1133      {
1134          output[i]=(max_amplitude<input[i])?max_amplitude:input[i];
1135          output[i]=(-max_amplitude>output[i])?-max_amplitude:output[i];
1136      }
1137  }
1138  
1139  void gain_ff(float* input, float* output, int input_size, float gain)
1140  {
1141      for(int i=0;i<input_size;i++) output[i]=gain*input[i]; //@gain_ff
1142  }
1143  
1144  float get_power_f(float* input, int input_size, int decimation)
1145  {
1146      float acc = 0;
1147      for(int i=0;i<input_size;i+=decimation)
1148      {
1149          acc += (input[i]*input[i])/input_size;
1150      }
1151      return acc;
1152  }
1153  
1154  float get_power_c(complexf* input, int input_size, int decimation)
1155  {
1156      float acc = 0;
1157      for(int i=0;i<input_size;i+=decimation)
1158      {
1159          acc += (iof(input,i)*iof(input,i)+qof(input,i)*qof(input,i))/input_size;
1160      }
1161      return acc;
1162  }
1163  
1164  /*
1165    __  __           _       _       _ 
1166   |  \/  |         | |     | |     |  |
1167   | \  / | ___   __| |_   _| | __ _|  |_ ___  _ __ ___
1168   | |\/| |/ _ \ / _` | | | | |/ _` |  __/ _ \| '__/ __|
1169   | |  | | (_) | (_| | |_| | | (_| |  || (_) | |  \__ \
1170   |_|  |_|\___/ \__,_|\__,_|_|\__,_|\ __\___/|_|  |___/
1171  
1172  */
1173  
1174  void add_dcoffset_cc(complexf* input, complexf* output, int input_size)
1175  {
1176      for(int i=0;i<input_size;i++) iof(output,i)=0.5+iof(input,i)/2;
1177      for(int i=0;i<input_size;i++) qof(output,i)=qof(input,i)/2;
1178  }
1179  
1180  float fmmod_fc(float* input, complexf* output, int input_size, float last_phase)
1181  {
1182      float phase=last_phase;
1183      for(int i=0;i<input_size;i++)
1184      {
1185          phase+=input[i]*PI;
1186          while(phase>PI) phase-=2*PI;
1187          while(phase<=-PI) phase+=2*PI;
1188          iof(output,i)=cos(phase);
1189          qof(output,i)=sin(phase);
1190      }
1191      return phase;
1192  }
1193  
1194  void fixed_amplitude_cc(complexf* input, complexf* output, int input_size, float new_amplitude)
1195  {
1196      for(int i=0;i<input_size;i++)
1197      {
1198          //float phase=atan2(iof(input,i),qof(input,i));
1199          //iof(output,i)=cos(phase)*amp;
1200          //qof(output,i)=sin(phase)*amp;
1201  
1202          //A faster solution:
1203          float amplitude_now = sqrt(iof(input,i)*iof(input,i)+qof(input,i)*qof(input,i));
1204          float gain = (amplitude_now > 0) ? new_amplitude / amplitude_now : 0;
1205          iof(output,i)=iof(input,i)*gain;
1206          qof(output,i)=qof(input,i)*gain;
1207      }
1208  }
1209  
1210  /*
1211    ______        _     ______               _             _______                   __
1212   |  ____|      | |   |  ____|             (_)           |__   __|                 / _|
1213   | |__ __ _ ___| |_  | |__ ___  _   _ _ __ _  ___ _ __     | |_ __ __ _ _ __  ___| |_ ___  _ __ _ __ ___
1214   |  __/ _` / __| __| |  __/ _ \| | | | '__| |/ _ \ '__|    | | '__/ _` | '_ \/ __|  _/ _ \| '__| '_ ` _ \
1215   | | | (_| \__ \ |_  | | | (_) | |_| | |  | |  __/ |       | | | | (_| | | | \__ \ || (_) | |  | | | | | |
1216   |_|  \__,_|___/\__| |_|  \___/ \__,_|_|  |_|\___|_|       |_|_|  \__,_|_| |_|___/_| \___/|_|  |_| |_| |_|
1217  
1218  */
1219  
1220  int log2n(int x)
1221  {
1222      int result=-1;
1223      for(int i=0;i<31;i++)
1224      {
1225          if((x>>i)&1) //@@log2n
1226          {
1227              if (result==-1) result=i;
1228              else return -1;
1229          }
1230      }
1231      return result;
1232  }
1233  
1234  int next_pow2(int x)
1235  {
1236      int pow2;
1237      //portability? (31 is the problem)
1238      for(int i=0;i<31;i++)
1239      {
1240          if(x<(pow2=1<<i)) return pow2; //@@next_pow2
1241      }
1242      return -1;
1243  }
1244  
1245  void apply_window_c(complexf* input, complexf* output, int size, window_t window)
1246  {
1247      float (*window_function)(float)=firdes_get_window_kernel(window);
1248      for(int i=0;i<size;i++) //@apply_window_c
1249      {
1250          float rate=(float)i/(size-1);
1251          iof(output,i)=iof(input,i)*window_function(2.0*rate+1.0);
1252          qof(output,i)=qof(input,i)*window_function(2.0*rate+1.0);
1253      }
1254  }
1255  
1256  float *precalculate_window(int size, window_t window)
1257  {
1258      float (*window_function)(float)=firdes_get_window_kernel(window);
1259      float *windowt;
1260      windowt = malloc(sizeof(float) * size);
1261      for(int i=0;i<size;i++) //@precalculate_window
1262      {
1263          float rate=(float)i/(size-1);
1264          windowt[i] = window_function(2.0*rate+1.0);
1265      }
1266      return windowt;
1267  }
1268  
1269  void apply_precalculated_window_c(complexf* input, complexf* output, int size, float *windowt)
1270  {
1271      for(int i=0;i<size;i++) //@apply_precalculated_window_c
1272      {
1273          iof(output,i)=iof(input,i)*windowt[i];
1274          qof(output,i)=qof(input,i)*windowt[i];
1275      }
1276  }
1277  
1278  void apply_precalculated_window_f(float* input, float* output, int size, float *windowt)
1279  {
1280  	for(int i=0;i<size;i++) //@apply_precalculated_window_f
1281  	{
1282  		output[i] = input[i] * windowt[i];
1283  	}
1284  }
1285  
1286  void apply_window_f(float* input, float* output, int size, window_t window)
1287  {
1288      float (*window_function)(float)=firdes_get_window_kernel(window);
1289      for(int i=0;i<size;i++) //@apply_window_f
1290      {
1291          float rate=(float)i/(size-1);
1292          output[i]=input[i]*window_function(2.0*rate+1.0);
1293      }
1294  }
1295  
1296  void logpower_cf(complexf* input, float* output, int size, float add_db)
1297  {
1298      for(int i=0;i<size;i++) output[i]=iof(input,i)*iof(input,i) + qof(input,i)*qof(input,i); //@logpower_cf: pass 1
1299  
1300      for(int i=0;i<size;i++) output[i]=log10(output[i]); //@logpower_cf: pass 2
1301  
1302      for(int i=0;i<size;i++) output[i]=10*output[i]+add_db; //@logpower_cf: pass 3
1303  }
1304  
1305  void accumulate_power_cf(complexf* input, float* output, int size)
1306  {
1307      for(int i=0;i<size;i++) output[i] += iof(input,i)*iof(input,i) + qof(input,i)*qof(input,i); //@logpower_cf: pass 1
1308  }
1309  
1310  void log_ff(float* input, float* output, int size, float add_db) {
1311      for(int i=0;i<size;i++) output[i]=log10(input[i]); //@logpower_cf: pass 2
1312  
1313      for(int i=0;i<size;i++) output[i]=10*output[i]+add_db; //@logpower_cf: pass 3
1314  }
1315  
1316  float total_logpower_cf(complexf* input, int input_size)
1317  {
1318      float acc = 0; 
1319      for(int i=0;i<input_size;i++) acc+=(iof(input,i)*iof(input,i) + qof(input,i)*qof(input,i));
1320      return 10*log10(acc/input_size);
1321  }
1322  
1323  /*
1324    _____  _       _ _        _       _                          _
1325   |  __ \(_)     (_) |      | |     | |                        | |
1326   | |  | |_  __ _ _| |_ __ _| |   __| | ___ _ __ ___   ___   __| |
1327   | |  | | |/ _` | | __/ _` | |  / _` |/ _ \ '_ ` _ \ / _ \ / _` |
1328   | |__| | | (_| | | || (_| | | | (_| |  __/ | | | | | (_) | (_| |
1329   |_____/|_|\__, |_|\__\__,_|_|  \__,_|\___|_| |_| |_|\___/ \__,_|
1330              __/ |
1331             |___/
1332  */
1333  
1334  psk31_varicode_item_t psk31_varicode_items[] =
1335  {
1336      { .code = 0b1010101011, .bitcount=10,   .ascii=0x00 }, //NUL, null
1337      { .code = 0b1011011011, .bitcount=10,   .ascii=0x01 }, //SOH, start of heading
1338      { .code = 0b1011101101, .bitcount=10,   .ascii=0x02 }, //STX, start of text
1339      { .code = 0b1101110111, .bitcount=10,   .ascii=0x03 }, //ETX, end of text
1340      { .code = 0b1011101011, .bitcount=10,   .ascii=0x04 }, //EOT, end of transmission
1341      { .code = 0b1101011111, .bitcount=10,   .ascii=0x05 }, //ENQ, enquiry
1342      { .code = 0b1011101111, .bitcount=10,   .ascii=0x06 }, //ACK, acknowledge
1343      { .code = 0b1011111101, .bitcount=10,   .ascii=0x07 }, //BEL, bell
1344      { .code = 0b1011111111, .bitcount=10,   .ascii=0x08 }, //BS, backspace
1345      { .code = 0b11101111,   .bitcount=8,    .ascii=0x09 }, //TAB, horizontal tab
1346      { .code = 0b11101,      .bitcount=5,    .ascii=0x0a }, //LF, NL line feed, new line
1347      { .code = 0b1101101111, .bitcount=10,   .ascii=0x0b }, //VT, vertical tab
1348      { .code = 0b1011011101, .bitcount=10,   .ascii=0x0c }, //FF, NP form feed, new page
1349      { .code = 0b11111,      .bitcount=5,    .ascii=0x0d }, //CR, carriage return (overwrite)
1350      { .code = 0b1101110101, .bitcount=10,   .ascii=0x0e }, //SO, shift out
1351      { .code = 0b1110101011, .bitcount=10,   .ascii=0x0f }, //SI, shift in
1352      { .code = 0b1011110111, .bitcount=10,   .ascii=0x10 }, //DLE, data link escape
1353      { .code = 0b1011110101, .bitcount=10,   .ascii=0x11 }, //DC1, device control 1
1354      { .code = 0b1110101101, .bitcount=10,   .ascii=0x12 }, //DC2, device control 2
1355      { .code = 0b1110101111, .bitcount=10,   .ascii=0x13 }, //DC3, device control 3
1356      { .code = 0b1101011011, .bitcount=10,   .ascii=0x14 }, //DC4, device control 4
1357      { .code = 0b1101101011, .bitcount=10,   .ascii=0x15 }, //NAK, negative acknowledge
1358      { .code = 0b1101101101, .bitcount=10,   .ascii=0x16 }, //SYN, synchronous idle
1359      { .code = 0b1101010111, .bitcount=10,   .ascii=0x17 }, //ETB, end of trans. block
1360      { .code = 0b1101111011, .bitcount=10,   .ascii=0x18 }, //CAN, cancel
1361      { .code = 0b1101111101, .bitcount=10,   .ascii=0x19 }, //EM, end of medium
1362      { .code = 0b1110110111, .bitcount=10,   .ascii=0x1a }, //SUB, substitute
1363      { .code = 0b1101010101, .bitcount=10,   .ascii=0x1b }, //ESC, escape
1364      { .code = 0b1101011101, .bitcount=10,   .ascii=0x1c }, //FS, file separator
1365      { .code = 0b1110111011, .bitcount=10,   .ascii=0x1d }, //GS, group separator
1366      { .code = 0b1011111011, .bitcount=10,   .ascii=0x1e }, //RS, record separator
1367      { .code = 0b1101111111, .bitcount=10,   .ascii=0x1f }, //US, unit separator
1368      { .code = 0b1,          .bitcount=1,    .ascii=0x20 }, //szóköz
1369      { .code = 0b111111111,  .bitcount=9,    .ascii=0x21 }, //!
1370      { .code = 0b101011111,  .bitcount=9,    .ascii=0x22 }, //"
1371      { .code = 0b111110101,  .bitcount=9,    .ascii=0x23 }, //#
1372      { .code = 0b111011011,  .bitcount=9,    .ascii=0x24 }, //$
1373      { .code = 0b1011010101, .bitcount=10,   .ascii=0x25 }, //%
1374      { .code = 0b1010111011, .bitcount=10,   .ascii=0x26 }, //&
1375      { .code = 0b101111111,  .bitcount=9,    .ascii=0x27 }, //'
1376      { .code = 0b11111011,   .bitcount=8,    .ascii=0x28 }, //(
1377      { .code = 0b11110111,   .bitcount=8,    .ascii=0x29 }, //)
1378      { .code = 0b101101111,  .bitcount=9,    .ascii=0x2a }, //*
1379      { .code = 0b111011111,  .bitcount=9,    .ascii=0x2b }, //+
1380      { .code = 0b1110101,    .bitcount=7,    .ascii=0x2c }, //,
1381      { .code = 0b110101,     .bitcount=6,    .ascii=0x2d }, //-
1382      { .code = 0b1010111,    .bitcount=7,    .ascii=0x2e }, //.
1383      { .code = 0b110101111,  .bitcount=9,    .ascii=0x2f }, ///
1384      { .code = 0b10110111,   .bitcount=8,    .ascii=0x30 }, //0
1385      { .code = 0b10111101,   .bitcount=8,    .ascii=0x31 }, //1
1386      { .code = 0b11101101,   .bitcount=8,    .ascii=0x32 }, //2
1387      { .code = 0b11111111,   .bitcount=8,    .ascii=0x33 }, //3
1388      { .code = 0b101110111,  .bitcount=9,    .ascii=0x34 }, //4
1389      { .code = 0b101011011,  .bitcount=9,    .ascii=0x35 }, //5
1390      { .code = 0b101101011,  .bitcount=9,    .ascii=0x36 }, //6
1391      { .code = 0b110101101,  .bitcount=9,    .ascii=0x37 }, //7
1392      { .code = 0b110101011,  .bitcount=9,    .ascii=0x38 }, //8
1393      { .code = 0b110110111,  .bitcount=9,    .ascii=0x39 }, //9
1394      { .code = 0b11110101,   .bitcount=8,    .ascii=0x3a }, //:
1395      { .code = 0b110111101,  .bitcount=9,    .ascii=0x3b }, //;
1396      { .code = 0b111101101,  .bitcount=9,    .ascii=0x3c }, //<
1397      { .code = 0b1010101,    .bitcount=7,    .ascii=0x3d }, //=
1398      { .code = 0b111010111,  .bitcount=9,    .ascii=0x3e }, //>
1399      { .code = 0b1010101111, .bitcount=10,   .ascii=0x3f }, //?
1400      { .code = 0b1010111101, .bitcount=10,   .ascii=0x40 }, //@
1401      { .code = 0b1111101,    .bitcount=7,    .ascii=0x41 }, //A
1402      { .code = 0b11101011,   .bitcount=8,    .ascii=0x42 }, //B
1403      { .code = 0b10101101,   .bitcount=8,    .ascii=0x43 }, //C
1404      { .code = 0b10110101,   .bitcount=8,    .ascii=0x44 }, //D
1405      { .code = 0b1110111,    .bitcount=7,    .ascii=0x45 }, //E
1406      { .code = 0b11011011,   .bitcount=8,    .ascii=0x46 }, //F
1407      { .code = 0b11111101,   .bitcount=8,    .ascii=0x47 }, //G
1408      { .code = 0b101010101,  .bitcount=9,    .ascii=0x48 }, //H
1409      { .code = 0b1111111,    .bitcount=7,    .ascii=0x49 }, //I
1410      { .code = 0b111111101,  .bitcount=9,    .ascii=0x4a }, //J
1411      { .code = 0b101111101,  .bitcount=9,    .ascii=0x4b }, //K
1412      { .code = 0b11010111,   .bitcount=8,    .ascii=0x4c }, //L
1413      { .code = 0b10111011,   .bitcount=8,    .ascii=0x4d }, //M
1414      { .code = 0b11011101,   .bitcount=8,    .ascii=0x4e }, //N
1415      { .code = 0b10101011,   .bitcount=8,    .ascii=0x4f }, //O
1416      { .code = 0b11010101,   .bitcount=8,    .ascii=0x50 }, //P
1417      { .code = 0b111011101,  .bitcount=9,    .ascii=0x51 }, //Q
1418      { .code = 0b10101111,   .bitcount=8,    .ascii=0x52 }, //R
1419      { .code = 0b1101111,    .bitcount=7,    .ascii=0x53 }, //S
1420      { .code = 0b1101101,    .bitcount=7,    .ascii=0x54 }, //T
1421      { .code = 0b101010111,  .bitcount=9,    .ascii=0x55 }, //U
1422      { .code = 0b110110101,  .bitcount=9,    .ascii=0x56 }, //V
1423      { .code = 0b101011101,  .bitcount=9,    .ascii=0x57 }, //W
1424      { .code = 0b101110101,  .bitcount=9,    .ascii=0x58 }, //X
1425      { .code = 0b101111011,  .bitcount=9,    .ascii=0x59 }, //Y
1426      { .code = 0b1010101101, .bitcount=10,   .ascii=0x5a }, //Z
1427      { .code = 0b111110111,  .bitcount=9,    .ascii=0x5b }, //[
1428      { .code = 0b111101111,  .bitcount=9,    .ascii=0x5c }, //backslash
1429      { .code = 0b111111011,  .bitcount=9,    .ascii=0x5d }, //]
1430      { .code = 0b1010111111, .bitcount=10,   .ascii=0x5e }, //^
1431      { .code = 0b101101101,  .bitcount=9,    .ascii=0x5f }, //_
1432      { .code = 0b1011011111, .bitcount=10,   .ascii=0x60 }, //`
1433      { .code = 0b1011,       .bitcount=4,    .ascii=0x61 }, //a
1434      { .code = 0b1011111,    .bitcount=7,    .ascii=0x62 }, //b
1435      { .code = 0b101111,     .bitcount=6,    .ascii=0x63 }, //c
1436      { .code = 0b101101,     .bitcount=6,    .ascii=0x64 }, //d
1437      { .code = 0b11,         .bitcount=2,    .ascii=0x65 }, //e
1438      { .code = 0b111101,     .bitcount=6,    .ascii=0x66 }, //f
1439      { .code = 0b1011011,    .bitcount=7,    .ascii=0x67 }, //g
1440      { .code = 0b101011,     .bitcount=6,    .ascii=0x68 }, //h
1441      { .code = 0b1101,       .bitcount=4,    .ascii=0x69 }, //i
1442      { .code = 0b111101011,  .bitcount=9,    .ascii=0x6a }, //j
1443      { .code = 0b10111111,   .bitcount=8,    .ascii=0x6b }, //k
1444      { .code = 0b11011,      .bitcount=5,    .ascii=0x6c }, //l
1445      { .code = 0b111011,     .bitcount=6,    .ascii=0x6d }, //m
1446      { .code = 0b1111,       .bitcount=4,    .ascii=0x6e }, //n
1447      { .code = 0b111,        .bitcount=3,    .ascii=0x6f }, //o
1448      { .code = 0b111111,     .bitcount=6,    .ascii=0x70 }, //p
1449      { .code = 0b110111111,  .bitcount=9,    .ascii=0x71 }, //q
1450      { .code = 0b10101,      .bitcount=5,    .ascii=0x72 }, //r
1451      { .code = 0b10111,      .bitcount=5,    .ascii=0x73 }, //s
1452      { .code = 0b101,        .bitcount=3,    .ascii=0x74 }, //t
1453      { .code = 0b110111,     .bitcount=6,    .ascii=0x75 }, //u
1454      { .code = 0b1111011,    .bitcount=7,    .ascii=0x76 }, //v
1455      { .code = 0b1101011,    .bitcount=7,    .ascii=0x77 }, //w
1456      { .code = 0b11011111,   .bitcount=8,    .ascii=0x78 }, //x
1457      { .code = 0b1011101,    .bitcount=7,    .ascii=0x79 }, //y
1458      { .code = 0b111010101,  .bitcount=9,    .ascii=0x7a }, //z
1459      { .code = 0b1010110111, .bitcount=10,   .ascii=0x7b }, //{
1460      { .code = 0b110111011,  .bitcount=9,    .ascii=0x7c }, //|
1461      { .code = 0b1010110101, .bitcount=10,   .ascii=0x7d }, //}
1462      { .code = 0b1011010111, .bitcount=10,   .ascii=0x7e }, //~
1463      { .code = 0b1110110101, .bitcount=10,   .ascii=0x7f }, //DEL
1464  };
1465  
1466  unsigned long long psk31_varicode_masklen_helper[] =
1467  {
1468      0b0000000000000000000000000000000000000000000000000000000000000000,
1469      0b0000000000000000000000000000000000000000000000000000000000000001,
1470      0b0000000000000000000000000000000000000000000000000000000000000011,
1471      0b0000000000000000000000000000000000000000000000000000000000000111,
1472      0b0000000000000000000000000000000000000000000000000000000000001111,
1473      0b0000000000000000000000000000000000000000000000000000000000011111,
1474      0b0000000000000000000000000000000000000000000000000000000000111111,
1475      0b0000000000000000000000000000000000000000000000000000000001111111,
1476      0b0000000000000000000000000000000000000000000000000000000011111111,
1477      0b0000000000000000000000000000000000000000000000000000000111111111,
1478      0b0000000000000000000000000000000000000000000000000000001111111111,
1479      0b0000000000000000000000000000000000000000000000000000011111111111,
1480      0b0000000000000000000000000000000000000000000000000000111111111111,
1481      0b0000000000000000000000000000000000000000000000000001111111111111,
1482      0b0000000000000000000000000000000000000000000000000011111111111111,
1483      0b0000000000000000000000000000000000000000000000000111111111111111,
1484      0b0000000000000000000000000000000000000000000000001111111111111111,
1485      0b0000000000000000000000000000000000000000000000011111111111111111,
1486      0b0000000000000000000000000000000000000000000000111111111111111111,
1487      0b0000000000000000000000000000000000000000000001111111111111111111,
1488      0b0000000000000000000000000000000000000000000011111111111111111111,
1489      0b0000000000000000000000000000000000000000000111111111111111111111,
1490      0b0000000000000000000000000000000000000000001111111111111111111111,
1491      0b0000000000000000000000000000000000000000011111111111111111111111,
1492      0b0000000000000000000000000000000000000000111111111111111111111111,
1493      0b0000000000000000000000000000000000000001111111111111111111111111,
1494      0b0000000000000000000000000000000000000011111111111111111111111111,
1495      0b0000000000000000000000000000000000000111111111111111111111111111,
1496      0b0000000000000000000000000000000000001111111111111111111111111111,
1497      0b0000000000000000000000000000000000011111111111111111111111111111,
1498      0b0000000000000000000000000000000000111111111111111111111111111111,
1499      0b0000000000000000000000000000000001111111111111111111111111111111,
1500      0b0000000000000000000000000000000011111111111111111111111111111111,
1501      0b0000000000000000000000000000000111111111111111111111111111111111,
1502      0b0000000000000000000000000000001111111111111111111111111111111111,
1503      0b0000000000000000000000000000011111111111111111111111111111111111,
1504      0b0000000000000000000000000000111111111111111111111111111111111111,
1505      0b0000000000000000000000000001111111111111111111111111111111111111,
1506      0b0000000000000000000000000011111111111111111111111111111111111111,
1507      0b0000000000000000000000000111111111111111111111111111111111111111,
1508      0b0000000000000000000000001111111111111111111111111111111111111111,
1509      0b0000000000000000000000011111111111111111111111111111111111111111,
1510      0b0000000000000000000000111111111111111111111111111111111111111111,
1511      0b0000000000000000000001111111111111111111111111111111111111111111,
1512      0b0000000000000000000011111111111111111111111111111111111111111111,
1513      0b0000000000000000000111111111111111111111111111111111111111111111,
1514      0b0000000000000000001111111111111111111111111111111111111111111111,
1515      0b0000000000000000011111111111111111111111111111111111111111111111,
1516      0b0000000000000000111111111111111111111111111111111111111111111111,
1517      0b0000000000000001111111111111111111111111111111111111111111111111,
1518      0b0000000000000011111111111111111111111111111111111111111111111111,
1519      0b0000000000000111111111111111111111111111111111111111111111111111,
1520      0b0000000000001111111111111111111111111111111111111111111111111111,
1521      0b0000000000011111111111111111111111111111111111111111111111111111,
1522      0b0000000000111111111111111111111111111111111111111111111111111111,
1523      0b0000000001111111111111111111111111111111111111111111111111111111,
1524      0b0000000011111111111111111111111111111111111111111111111111111111,
1525      0b0000000111111111111111111111111111111111111111111111111111111111,
1526      0b0000001111111111111111111111111111111111111111111111111111111111,
1527      0b0000011111111111111111111111111111111111111111111111111111111111,
1528      0b0000111111111111111111111111111111111111111111111111111111111111,
1529      0b0001111111111111111111111111111111111111111111111111111111111111,
1530      0b0011111111111111111111111111111111111111111111111111111111111111,
1531      0b0111111111111111111111111111111111111111111111111111111111111111
1532  };
1533  
1534  const int n_psk31_varicode_items = sizeof(psk31_varicode_items) / sizeof(psk31_varicode_item_t);
1535  
1536  char psk31_varicode_decoder_push(unsigned long long* status_shr, unsigned char symbol)
1537  {
1538      *status_shr=((*status_shr)<<1)|(!!symbol); //shift new bit in shift register
1539      //fprintf(stderr,"*status_shr = %llx\n", *status_shr);
1540      if((*status_shr)&0xFFF==0) return 0;
1541      for(int i=0;i<n_psk31_varicode_items;i++)
1542      {
1543          //fprintf(stderr,"| i = %d | %llx ?= %llx | bitsall = %d\n", i, psk31_varicode_items[i].code<<2, (*status_shr)&psk31_varicode_masklen_helper[(psk31_varicode_items[i].bitcount+4)&63], (psk31_varicode_items[i].bitcount+4)&63);
1544          if((psk31_varicode_items[i].code<<2)==((*status_shr)&psk31_varicode_masklen_helper[(psk31_varicode_items[i].bitcount+4)&63]))
1545              { /*fprintf(stderr,">>>>>>>>> %d %x %c\n", i,  psk31_varicode_items[i].ascii,  psk31_varicode_items[i].ascii);*/ return psk31_varicode_items[i].ascii; }
1546  
1547      }
1548      return 0;
1549  }
1550  
1551  void psk31_varicode_encoder_u8_u8(unsigned char* input, unsigned char* output, int input_size, int output_max_size, int* input_processed, int* output_size)
1552  {
1553      (*output_size)=0;
1554      for((*input_processed)=0; (*input_processed)<input_size; (*input_processed)++)
1555      {
1556          //fprintf(stderr, "ii = %d, input_size = %d, output_max_size = %d\n", *input_processed, input_size, output_max_size);
1557          for(int ci=0; ci<n_psk31_varicode_items; ci++) //ci: character index
1558          {
1559              psk31_varicode_item_t current_varicode = psk31_varicode_items[ci];
1560              if(input[*input_processed]==current_varicode.ascii)
1561              {
1562                  //fprintf(stderr, "ci = %d\n", ci);
1563                  if(output_max_size<current_varicode.bitcount+2) return;
1564                  for(int bi=0; bi<current_varicode.bitcount+2; bi++) //bi: bit index
1565                  {
1566                      //fprintf(stderr, "bi = %d\n", bi);
1567                      output[*output_size] = (bi<current_varicode.bitcount) ? (psk31_varicode_items[ci].code>>(current_varicode.bitcount-bi-1))&1 : 0;
1568                      (*output_size)++;
1569                      output_max_size--;
1570                  }
1571                  break;
1572              }
1573          }
1574      }
1575  }
1576  
1577  rtty_baudot_item_t rtty_baudot_items[] =
1578  {
1579      { .code = 0b00000, .ascii_letter=0,     .ascii_figure=0 },
1580      { .code = 0b10000, .ascii_letter='E',   .ascii_figure='3' },
1581      { .code = 0b01000, .ascii_letter='\n',  .ascii_figure='\n' },
1582      { .code = 0b11000, .ascii_letter='A',   .ascii_figure='-' },
1583      { .code = 0b00100, .ascii_letter=' ',   .ascii_figure=' ' },
1584      { .code = 0b10100, .ascii_letter='S',   .ascii_figure='\'' },
1585      { .code = 0b01100, .ascii_letter='I',   .ascii_figure='8' },
1586      { .code = 0b11100, .ascii_letter='U',   .ascii_figure='7' },
1587      { .code = 0b00010, .ascii_letter='\r',  .ascii_figure='\r' },
1588      { .code = 0b10010, .ascii_letter='D',   .ascii_figure='#' },
1589      { .code = 0b01010, .ascii_letter='R',   .ascii_figure='4' },
1590      { .code = 0b11010, .ascii_letter='J',   .ascii_figure='\a' },
1591      { .code = 0b00110, .ascii_letter='N',   .ascii_figure=',' },
1592      { .code = 0b10110, .ascii_letter='F',   .ascii_figure='@' },
1593      { .code = 0b01110, .ascii_letter='C',   .ascii_figure=':' },
1594      { .code = 0b11110, .ascii_letter='K',   .ascii_figure='(' },
1595      { .code = 0b00001, .ascii_letter='T',   .ascii_figure='5' },
1596      { .code = 0b10001, .ascii_letter='Z',   .ascii_figure='+' },
1597      { .code = 0b01001, .ascii_letter='L',   .ascii_figure=')' },
1598      { .code = 0b11001, .ascii_letter='W',   .ascii_figure='2' },
1599      { .code = 0b00101, .ascii_letter='H',   .ascii_figure='$' },
1600      { .code = 0b10101, .ascii_letter='Y',   .ascii_figure='6' },
1601      { .code = 0b01101, .ascii_letter='P',   .ascii_figure='0' },
1602      { .code = 0b11101, .ascii_letter='Q',   .ascii_figure='1' },
1603      { .code = 0b00011, .ascii_letter='O',   .ascii_figure='9' },
1604      { .code = 0b10011, .ascii_letter='B',   .ascii_figure='?' },
1605      { .code = 0b01011, .ascii_letter='G',   .ascii_figure='*' },
1606      { .code = 0b00111, .ascii_letter='M',   .ascii_figure='.' },
1607      { .code = 0b10111, .ascii_letter='X',   .ascii_figure='/' },
1608      { .code = 0b01111, .ascii_letter='V',   .ascii_figure='=' }
1609  };
1610  
1611  const int n_rtty_baudot_items = sizeof(rtty_baudot_items) / sizeof(rtty_baudot_item_t);
1612  
1613  char rtty_baudot_decoder_lookup(unsigned char* fig_mode, unsigned char c)
1614  {
1615      if(c==RTTY_FIGURE_MODE_SELECT_CODE) { *fig_mode=1; return 0; }
1616      if(c==RTTY_LETTER_MODE_SELECT_CODE) { *fig_mode=0; return 0; }
1617      for(int i=0;i<n_rtty_baudot_items;i++)
1618          if(rtty_baudot_items[i].code==c)
1619              return (*fig_mode) ? rtty_baudot_items[i].ascii_figure : rtty_baudot_items[i].ascii_letter;
1620      return 0;
1621  }
1622  
1623  char rtty_baudot_decoder_push(rtty_baudot_decoder_t* s, unsigned char symbol)
1624  {
1625      //For RTTY waveforms, check this: http://www.ham.hu/radiosatvitel/szoveg/RTTY/kepek/rtty.gif
1626      //RTTY is much like an UART data transfer with 1 start bit, 5 data bits and 1 stop bit.
1627      //The start pulse and stop pulse are used for synchronization.
1628      symbol=!!symbol; //We want symbol to be 0 or 1.
1629      switch(s->state)
1630      {
1631      case RTTY_BAUDOT_WAITING_STOP_PULSE:
1632          if(symbol==1) { s->state = RTTY_BAUDOT_WAITING_START_PULSE; if(s->character_received) return rtty_baudot_decoder_lookup(&s->fig_mode, s->shr&31); }
1633              //If the character data is followed by a stop pulse, then we go on to wait for the next character.
1634          else  s->character_received = 0;
1635              //The character should be followed by a stop pulse. If the stop pulse is missing, that is certainly an error.
1636              //In that case, we remove forget the character we just received.
1637          break;
1638      case RTTY_BAUDOT_WAITING_START_PULSE:
1639          s->character_received = 0;
1640          if(symbol==0) { s->state = RTTY_BAUDOT_RECEIVING_DATA; s->shr = s->bit_cntr = 0; }
1641              //Any number of high bits can come after each other, until interrupted with a low bit (start pulse) to indicate
1642              //the beginning of a new character. If we get this start pulse, we go on to wait for the characters. We also
1643              //clear the variables used for counting (bit_cntr) and storing (shr) the data bits.
1644          break;
1645      case RTTY_BAUDOT_RECEIVING_DATA:
1646          s->shr = (s->shr<<1)|(!!symbol);
1647              //We store 5 bits into our shift register
1648          if(s->bit_cntr++==4) { s->state = RTTY_BAUDOT_WAITING_STOP_PULSE; s->character_received = 1; }
1649              //If this is the 5th bit stored, then we wait for the stop pulse.
1650          break;
1651      default: break;
1652      }
1653      return 0;
1654  }
1655  
1656  #define DEBUG_SERIAL_LINE_DECODER 0
1657  
1658  //What has not been checked:
1659  //  behaviour on 1.5 stop bits
1660  //  check all exit conditions
1661  
1662  void serial_line_decoder_f_u8(serial_line_t* s, float* input, unsigned char* output, int input_size)
1663  {
1664      static int abs_samples_helper = 0;
1665      abs_samples_helper += s->input_used;
1666      int iabs_samples_helper = abs_samples_helper;
1667      s->output_size = 0;
1668      s->input_used = 0;
1669      short* output_s = (short*)output;
1670      unsigned* output_u = (unsigned*)output;
1671      for(;;)
1672      {
1673          //we find the start bit (first negative edge on the line)
1674          int startbit_start = -1;
1675          int i;
1676          for(i=1;i<input_size;i++) if(input[i] < 0 && input[i-1] > 0) { startbit_start=i; break; }
1677  
1678          if(startbit_start == -1) { s->input_used += i; DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:startbit_not_found (+%d)\n", s->input_used); return; }
1679          DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:startbit_found at %d (%d)\n", startbit_start, iabs_samples_helper + startbit_start);
1680  
1681          //If the stop bit would be too far so that we reached the end of the buffer, then we return failed.
1682          //The caller can rearrange the buffer so that the whole character fits into it.
1683          float all_bits = 1 + s->databits + s->stopbits;
1684          DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:all_bits = %f\n", all_bits);
1685          if(startbit_start + s->samples_per_bits * all_bits >= input_size) { s->input_used += MAX_M(0,startbit_start-2); DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:return_stopbit_too_far (+%d)\n", s->input_used); return; }
1686  
1687          //We do the actual sampling.
1688          int di; //databit counter
1689          unsigned shr = 0;
1690          for(di=0; di < s->databits; di++)
1691          {
1692              int databit_start = startbit_start + (1+di+(0.5*(1-s->bit_sampling_width_ratio))) * s->samples_per_bits;
1693              int databit_end   = startbit_start + (1+di+(0.5*(1+s->bit_sampling_width_ratio))) * s->samples_per_bits;
1694              DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:databit_start = %d (%d)\n", databit_start, iabs_samples_helper+databit_start);
1695              DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:databit_end   = %d (%d)\n", databit_end,   iabs_samples_helper+databit_end);
1696              float databit_acc = 0;
1697              for(i=databit_start;i<databit_end;i++) { databit_acc += input[i]; /*DEBUG_SERIAL_LINE_DECODER && fprintf(stderr, "%f (%f) ", input[i], databit_acc);*/ }
1698              //DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"\n");
1699              DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:databit_decision = %d\n", !!(databit_acc>0));
1700              shr=(shr<<1)|!!(databit_acc>0);
1701          }
1702          DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:shr = 0x%x, %d\n", shr, shr);
1703  
1704          //We check if the stopbit is correct.
1705          int stopbit_start = startbit_start + (1+s->databits) * s->samples_per_bits + (s->stopbits * 0.5 * (1-s->bit_sampling_width_ratio)) * s->samples_per_bits;
1706          int stopbit_end   = startbit_start + (1+s->databits) * s->samples_per_bits + (s->stopbits * 0.5 * (1+s->bit_sampling_width_ratio)) * s->samples_per_bits;
1707          DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:stopbit_start = %d (%d)\n", stopbit_start, iabs_samples_helper+stopbit_start);
1708          DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:stopbit_end   = %d (%d)\n", stopbit_end,   iabs_samples_helper+stopbit_end);
1709          float stopbit_acc = 0;
1710          for(i=stopbit_start;i<stopbit_end;i++) { stopbit_acc += input[i]; DEBUG_SERIAL_LINE_DECODER && fprintf(stderr, "%f (%f) ", input[i], stopbit_acc); }
1711          DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"\n");
1712          if(stopbit_acc<0) { s->input_used += MIN_M(startbit_start + 1, input_size); DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:return_stopbit_faulty (+%d)\n", s->input_used); return; }
1713          DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:stopbit_found\n");
1714  
1715          //we write the output sample
1716          if(s->databits <= 8) output[s->output_size] = shr;
1717          else if(s->databits <= 16) output_s[s->output_size] = shr;
1718          else output_u[s->output_size] = shr;
1719          s->output_size++;
1720  
1721          int samples_used_up_now = MIN_M(startbit_start + all_bits * s->samples_per_bits, input_size);
1722          s->input_used += samples_used_up_now;
1723          input += samples_used_up_now;
1724          input_size -= samples_used_up_now;
1725          iabs_samples_helper += samples_used_up_now;
1726          if(!input_size) { DEBUG_SERIAL_LINE_DECODER && fprintf(stderr,"sld:return_no_more_input (+%d)\n", s->input_used); return; }
1727      }
1728      DEBUG_SERIAL_LINE_DECODER && fprintf(stderr, "sld: >> output_size = %d  (+%d)\n", s->output_size, s->input_used);
1729  }
1730  
1731  void generic_slicer_f_u8(float* input, unsigned char* output, int input_size, int n_symbols)
1732  {
1733      float symbol_distance = 2.0/(n_symbols-1);
1734      for(int i=0;i<input_size;i++) 
1735          for(int j=0;j<n_symbols;j++)
1736          {
1737              float symbol_center = -1+j*symbol_distance;
1738              float symbol_low_limit = symbol_center-(symbol_distance/2);
1739              float symbol_high_limit = symbol_center+(symbol_distance/2);
1740              if(j==0)
1741              {
1742                  if(input[i]<symbol_high_limit) 
1743                  {
1744                      output[i]=j;
1745                      break;
1746                  }
1747              }
1748              else if (j==n_symbols-1)
1749              {
1750                  if(input[i]>=symbol_low_limit) 
1751                  {
1752                      output[i]=j;
1753                      break;
1754                  }
1755              }
1756              else 
1757              {
1758                  if(input[i]>=symbol_low_limit && input[i]<symbol_high_limit) 
1759                  {
1760                      output[i]=j;
1761                      break;
1762                  }
1763              }
1764          }
1765  }
1766  
1767  void binary_slicer_f_u8(float* input, unsigned char* output, int input_size)
1768  {
1769      for(int i=0;i<input_size;i++) output[i] = input[i] > 0;
1770  }
1771  
1772  void psk_modulator_u8_c(unsigned char* input, complexf* output, int input_size, int n_psk)
1773  {
1774      //outputs one complex sample per input symbol
1775      float phase_increment = (2*M_PI)/n_psk;
1776      for(int i=0;i<input_size;i++)
1777      {
1778          float out_phase=phase_increment*input[i];
1779          iof(output,i)=cos(out_phase);
1780          qof(output,i)=sin(out_phase);
1781      }
1782  }
1783  
1784  void duplicate_samples_ntimes_u8_u8(unsigned char* input, unsigned char* output, int input_size_bytes, int sample_size_bytes, int ntimes)
1785  {
1786      int l=0;
1787      for(int i=0;i<input_size_bytes;i+=sample_size_bytes)
1788          for(int k=0;k<ntimes;k++)
1789              for(int j=0;j<sample_size_bytes;j++)
1790                  output[l++]=input[i+j];
1791  }
1792  
1793  complexf psk31_interpolate_sine_cc(complexf* input, complexf* output, int input_size, int interpolation, complexf last_input)
1794  {
1795      int oi=0; //output index
1796      for(int i=0;i<input_size;i++)
1797      {
1798          for(int j=0; j<interpolation; j++)
1799          {
1800              float rate = (1+sin(-(M_PI/2)+M_PI*((j+1)/(float)interpolation)))/2;
1801              iof(output,oi)=iof(input,i) * rate + iof(&last_input,0) * (1-rate);
1802              qof(output,oi)=qof(input,i) * rate + qof(&last_input,0) * (1-rate);
1803              oi++;
1804          }
1805          last_input = input[i];
1806      }
1807      return last_input;
1808  }
1809  
1810  void pack_bits_1to8_u8_u8(unsigned char* input, unsigned char* output, int input_size)
1811  { //output size should be input_size × 8
1812      for(int i=0; i<input_size; i++)
1813          for(int bi=0; bi<8; bi++) //bi: bit index
1814              *(output++)=(input[i]>>bi)&1;
1815  }
1816  
1817  
1818  unsigned char pack_bits_8to1_u8_u8(unsigned char* input)
1819  {
1820      unsigned char output;
1821      for(int i=0;i<8;i++)
1822      {
1823          output<<=1;
1824          output|=!!input[i];
1825      }
1826      return output;
1827  }
1828  unsigned char differential_codec(unsigned char* input, unsigned char* output, int input_size, int encode, unsigned char state)
1829  {
1830      if(!encode)
1831          for(int i=0;i<input_size;i++) 
1832          {
1833              output[i] = input[i] == state;
1834              state = input[i];
1835          }
1836      else
1837          for(int i=0;i<input_size;i++) 
1838          {
1839              if(!input[i]) state=!state;
1840              output[i] = state;
1841          }
1842      return state;
1843  }
1844  
1845  /*
1846     _____                _                      _______ _           _                _____
1847    / ____|              (_)             ___    |__   __(_)         (_)              |  __ \
1848   | |     __ _ _ __ _ __ _  ___ _ __   ( _ )      | |   _ _ __ ___  _ _ __   __ _   | |__) |___  ___ _____   _____ _ __ _   _
1849   | |    / _` | '__| '__| |/ _ \ '__|  / _ \/\    | |  | | '_ ` _ \| | '_ \ / _` |  |  _  // _ \/ __/ _ \ \ / / _ \ '__| | | |
1850   | |___| (_| | |  | |  | |  __/ |    | (_>  <    | |  | | | | | | | | | | | (_| |  | | \ \  __/ (_| (_) \ V /  __/ |  | |_| |
1851    \_____\__,_|_|  |_|  |_|\___|_|     \___/\/    |_|  |_|_| |_| |_|_|_| |_|\__, |  |_|  \_\___|\___\___/ \_/ \___|_|   \__, |
1852                                                                              __/ |                                       __/ |
1853                                                                             |___/                                       |___/
1854  */
1855  
1856  void pll_cc_init_pi_controller(pll_t* p, float bandwidth, float ko, float kd, float damping_factor)
1857  {
1858      //kd: detector gain
1859      //ko: VCO gain
1860      float bandwidth_omega = 2*M_PI*bandwidth;
1861      p->alpha  = (damping_factor*2*bandwidth_omega)/(ko*kd);
1862      float sampling_rate = 1; //the bandwidth is normalized to the sampling rate
1863      p->beta   = (bandwidth_omega*bandwidth_omega)/(sampling_rate*ko*kd);
1864      p->iir_temp = p->dphase = p->output_phase = 0;
1865  }
1866  
1867  void pll_cc_init_p_controller(pll_t* p, float alpha)
1868  {
1869      p->alpha = alpha;
1870      p->dphase=p->output_phase=0;
1871  }
1872  
1873  
1874  void pll_cc(pll_t* p, complexf* input, float* output_dphase, complexf* output_nco, int input_size)
1875  {
1876      for(int i=0;i<input_size;i++)
1877      {
1878          p->output_phase += p->dphase;
1879          while(p->output_phase>PI) p->output_phase-=2*PI;
1880          while(p->output_phase<-PI) p->output_phase+=2*PI;
1881          complexf current_nco;
1882          iof(&current_nco,0) = sin(p->output_phase);
1883          qof(&current_nco,0) = cos(p->output_phase);
1884          if(output_nco) output_nco[i] = current_nco; //we don't output anything if it is a NULL pointer
1885  
1886          //accurate phase detector: calculating error from phase offset
1887          float input_phase = atan2(iof(input,i),qof(input,i));
1888          float new_dphase = input_phase - p->output_phase;
1889          while(new_dphase>PI) new_dphase-=2*PI;
1890          while(new_dphase<-PI) new_dphase+=2*PI;
1891  
1892          //modeling analog phase detector, which would be abs(input[i] * current_nco) if we had a real output signal, but what if we have complex signals?
1893          //qof(&current_nco,0)=-qof(&current_nco,0); //calculate conjugate
1894          //complexf multiply_result;
1895          //cmult(&multiply_result, &input[i], &current_nco);
1896          //output_nco[i] = multiply_result;
1897          //float new_dphase = absof(&multiply_result,0);
1898  
1899          if(p->pll_type == PLL_PI_CONTROLLER)
1900          {
1901              p->dphase = new_dphase * p->alpha + p->iir_temp;
1902              p->iir_temp += new_dphase * p->beta;
1903  
1904              while(p->dphase>PI) p->dphase-=2*PI; //won't need this one
1905              while(p->dphase<-PI) p->dphase+=2*PI;
1906          }
1907          else if(p->pll_type == PLL_P_CONTROLLER)
1908          {
1909              p->dphase = new_dphase * p->alpha;
1910          }
1911          else return;
1912          if(output_dphase) output_dphase[i] = -p->dphase;
1913          //if(output_dphase) output_dphase[i] = new_dphase/10;
1914      }
1915  }
1916  
1917  void octave_plot_point_on_cplxsig(complexf* signal, int signal_size, float error, int index, int correction_offset, char* writefiles_path, int points_size, ...)
1918  {
1919      static int figure_output_counter = 0;
1920      int* points_z = (int*)malloc(sizeof(int)*points_size);
1921      int* points_color = (int*)malloc(sizeof(int)*points_size);
1922      va_list vl;
1923      va_start(vl,points_size);
1924      for(int i=0;i<points_size;i++)
1925      {
1926          points_z[i] = va_arg(vl, int);
1927          points_color[i] = va_arg(vl, int);
1928      }
1929      if(writefiles_path && !figure_output_counter) fprintf(stderr, "cf=figure();\n");
1930      fprintf(stderr, "N = %d;\nisig = [", signal_size);
1931      for(int i=0;i<signal_size;i++) fprintf(stderr, "%f ", iof(signal,i));
1932      fprintf(stderr, "];\nqsig = [");
1933      for(int i=0;i<signal_size;i++) fprintf(stderr, "%f ", qof(signal,i));
1934      fprintf(stderr, "];\nzsig = [0:N-1];\nsubplot(2,2,[2 4]);\nplot3(isig,zsig,qsig,\"b-\",");
1935      for(int i=0;i<points_size;i++)
1936          fprintf(stderr, "[%f],[%d],[%f],\"%c.\"%c", 
1937              iof(signal, points_z[i]), points_z[i], qof(signal, points_z[i]), 
1938              (char)points_color[i]&0xff, (i<points_size-1)?',':' '
1939          );
1940      va_end(vl);
1941      fprintf(stderr, ");\ntitle(\"index = %d, error = %f, cxoffs = %d\");\nsubplot(2,2,1);\nplot(zsig, isig,\"b-\",", index, error, correction_offset);
1942      for(int i=0;i<points_size;i++)
1943          fprintf(stderr, "[%d],[%f],\"%c.\"%c", 
1944              points_z[i], iof(signal, points_z[i]),
1945              (char)points_color[i]&0xff, (i<points_size-1)?',':' '
1946          );
1947      fprintf(stderr, ");\nsubplot(2,2,3);\nplot(zsig, qsig,\"b-\",");
1948      for(int i=0;i<points_size;i++)
1949          fprintf(stderr, "[%d],[%f],\"%c.\"%c", 
1950              points_z[i], qof(signal, points_z[i]),
1951              (char)points_color[i]&0xff, (i<points_size-1)?',':' '
1952          );
1953      fprintf(stderr, ");\n");
1954      if(writefiles_path) fprintf(stderr, "print(cf, \"%s/%05d.png\", \"-S1024,1024\");\n", writefiles_path, figure_output_counter++); 
1955      fflush(stderr);
1956      free(points_z);
1957      free(points_color);
1958  }
1959  
1960  timing_recovery_state_t timing_recovery_init(timing_recovery_algorithm_t algorithm, int decimation_rate, int use_q, float loop_gain, float max_error, int debug_every_nth, char* debug_writefiles_path)
1961  {
1962      timing_recovery_state_t to_return;
1963      to_return.algorithm = algorithm;
1964      to_return.decimation_rate = decimation_rate;
1965      to_return.loop_gain = loop_gain;
1966      to_return.max_error = max_error;
1967      to_return.use_q = use_q;
1968      to_return.debug_phase = to_return.debug_every_nth = debug_every_nth; //debug is effective if it is >=0
1969      to_return.last_correction_offset = 0;
1970      to_return.earlylate_ratio = 0.25; //0..0.5
1971      to_return.debug_writefiles_path = debug_writefiles_path;
1972      return to_return;
1973  }
1974  
1975  #define MTIMINGR_HDEBUG 0
1976  
1977  void timing_recovery_cc(complexf* input, complexf* output, int input_size, float* timing_error, int* sampled_indexes, timing_recovery_state_t* state)
1978  {
1979      //We always assume that the input starts at center of the first symbol cross before the first symbol.
1980      //Last time we consumed that much from the input samples that it is there.
1981      int correction_offset = state->last_correction_offset;
1982      int current_bitstart_index = 0;
1983      int num_samples_bit = state->decimation_rate;
1984      int num_samples_halfbit = state->decimation_rate / 2;
1985      int num_samples_quarterbit = state->decimation_rate / 4;
1986      int num_samples_earlylate_wing = num_samples_bit * state->earlylate_ratio;
1987      float error;
1988      int el_point_left_index, el_point_right_index, el_point_mid_index;
1989      int si = 0;
1990      if(state->debug_every_nth>=0) fprintf(stderr, "disp(\"begin timing_recovery_cc\");\n");
1991      if(MTIMINGR_HDEBUG) fprintf(stderr, "timing_recovery_cc started, nsb = %d, nshb = %d, nsqb = %d\n", num_samples_bit, num_samples_halfbit, num_samples_quarterbit);
1992      {
1993          for(;;)
1994          {
1995              //the MathWorks style algorithm has correction_offset.
1996              //correction_offset = 0;            
1997              if(current_bitstart_index + num_samples_halfbit * 3 >= input_size) break;
1998              if(MTIMINGR_HDEBUG) fprintf(stderr, "current_bitstart_index = %d, input_size = %d, correction_offset(prev) = %d\n", 
1999                      current_bitstart_index, input_size, correction_offset);
2000              
2001              if(correction_offset<=-num_samples_quarterbit*0.9 || correction_offset>=0.9*num_samples_quarterbit)
2002              {
2003                  if(MTIMINGR_HDEBUG) fprintf(stderr, "correction_offset = %d, reset to 0!\n", correction_offset); 
2004                  correction_offset = 0;
2005              }
2006              //should check if the sign of the correction_offset (or disabling it) has an effect on the EVM.
2007              //it is also a possibility to disable multiplying with the magnitude
2008              if(state->algorithm == TIMING_RECOVERY_ALGORITHM_EARLYLATE)
2009              {
2010                  //bitstart index should be at symbol edge, maximum effect point is at current_bitstart_index + num_samples_halfbit
2011                  el_point_right_index  = current_bitstart_index + num_samples_earlylate_wing * 3;
2012                  el_point_left_index   = current_bitstart_index + num_samples_earlylate_wing * 1 - correction_offset;
2013                  el_point_mid_index    = current_bitstart_index + num_samples_halfbit;
2014                  if(sampled_indexes) sampled_indexes[si]=el_point_mid_index;
2015                  output[si++] = input[el_point_mid_index];
2016              }
2017              else if(state->algorithm == TIMING_RECOVERY_ALGORITHM_GARDNER)
2018              {
2019                  //maximum effect point is at current_bitstart_index
2020                  el_point_right_index  = current_bitstart_index + num_samples_halfbit * 3;
2021                  el_point_left_index   = current_bitstart_index + num_samples_halfbit * 1;
2022                  el_point_mid_index    = current_bitstart_index + num_samples_halfbit * 2;
2023                  if(sampled_indexes) sampled_indexes[si]=el_point_left_index;
2024                  output[si++] = input[el_point_left_index];
2025              }
2026              else break;
2027  
2028              error = ( iof(input, el_point_right_index) - iof(input, el_point_left_index) ) * iof(input, el_point_mid_index); 
2029              if(state->use_q)
2030              {
2031                  error += ( qof(input, el_point_right_index) - qof(input, el_point_left_index)) * qof(input, el_point_mid_index); 
2032                  error /= 2;
2033              }
2034              //Original correction method: this version can only move a single sample in any direction
2035              //current_bitstart_index += num_samples_halfbit * 2 + (error)?((error<0)?1:-1):0;
2036  
2037              if(timing_error) timing_error[si-1]=error; //it is not written if NULL
2038              
2039              if(error>state->max_error) error=state->max_error;
2040              if(error<-state->max_error) error=-state->max_error;
2041              if(state->debug_every_nth>=0)
2042              {
2043                  if(state->debug_every_nth==0 || state->debug_phase==0) 
2044                  {
2045                      state->debug_phase = state->debug_every_nth;
2046                      octave_plot_point_on_cplxsig(input+current_bitstart_index, state->decimation_rate*2, 
2047                          error, 
2048                          current_bitstart_index, 
2049                          correction_offset,
2050                          state->debug_writefiles_path,
2051                          3,
2052                          el_point_left_index - current_bitstart_index,  'r',
2053                          el_point_right_index - current_bitstart_index, 'r',
2054                          el_point_mid_index - current_bitstart_index,   'r',
2055                          0);
2056                  }
2057                  else state->debug_phase--;
2058              }
2059              int error_sign = (state->algorithm == TIMING_RECOVERY_ALGORITHM_GARDNER) ? -1 : 1;
2060              correction_offset = num_samples_halfbit * error_sign * error * state->loop_gain;
2061              current_bitstart_index += num_samples_bit + correction_offset;
2062              if(si>=input_size) 
2063              { 
2064                  if(MTIMINGR_HDEBUG) fprintf(stderr, "oops_out_of_si!\n"); 
2065                  break; 
2066              }
2067          }
2068      }
2069      state->input_processed = current_bitstart_index;
2070      state->output_size = si;
2071      state->last_correction_offset = correction_offset;
2072  }
2073  
2074  #define MTIMINGR_GAS(NAME) \
2075      if(!strcmp( #NAME , input )) return TIMING_RECOVERY_ALGORITHM_ ## NAME;
2076  
2077  timing_recovery_algorithm_t timing_recovery_get_algorithm_from_string(char* input)
2078  {
2079      MTIMINGR_GAS(GARDNER);
2080      MTIMINGR_GAS(EARLYLATE);
2081      return TIMING_RECOVERY_ALGORITHM_DEFAULT;
2082  }
2083  
2084  #define MTIMINGR_GSA(NAME) \
2085      if(algorithm == TIMING_RECOVERY_ALGORITHM_ ## NAME) return #NAME;
2086  
2087  char* timing_recovery_get_string_from_algorithm(timing_recovery_algorithm_t algorithm)
2088  {
2089      MTIMINGR_GSA(GARDNER);
2090      MTIMINGR_GSA(EARLYLATE);
2091      return "INVALID";
2092  }
2093  
2094  void init_bpsk_costas_loop_cc(bpsk_costas_loop_state_t* s, int decision_directed, float damping_factor, float bandwidth)
2095  {
2096      //fprintf(stderr, "init_bpsk_costas_loop_cc: bandwidth = %f, damping_factor = %f\n", bandwidth, damping_factor);
2097      //based on: http://gnuradio.squarespace.com/blog/2011/8/13/control-loop-gain-values.html
2098      float bandwidth_omega = 2*PI*bandwidth; //so that the bandwidth should be around 0.01 by default (2pi/100), and the damping_factor should be default 0.707
2099      float denomiator = 1+2*damping_factor*bandwidth_omega+bandwidth_omega*bandwidth_omega;
2100      fprintf(stderr, "damp = %f, bw = %f, bwomega = %f\n", damping_factor, bandwidth, bandwidth_omega);
2101      s->alpha = (4*damping_factor*bandwidth_omega)/denomiator;
2102      s->beta = (4*bandwidth_omega*bandwidth_omega)/denomiator;
2103      s->current_freq = s->dphase = s->nco_phase = 0;
2104      s->dphase_max=bandwidth_omega; //this has been determined by experiment: if dphase is out of [-dphase_max; dphase_max] it might actually hang and not come back 
2105      s->dphase_max_reset_to_zero=0;
2106  }
2107  
2108  void bpsk_costas_loop_cc(complexf* input, complexf* output, int input_size, float* output_error, float* output_dphase, complexf* output_nco, bpsk_costas_loop_state_t* s)
2109  {
2110      for(int i=0;i<input_size;i++)
2111      {
2112          complexf nco_sample;
2113          e_powj(&nco_sample, s->nco_phase);
2114          cmult(&output[i], &input[i], &nco_sample);
2115          if(output_nco) output_nco[i]=nco_sample;
2116          float error = 0;
2117          if(s->decision_directed)
2118          {
2119              float output_phase = atan2(qof(output,i),iof(output,i));
2120              if (fabs(output_phase)<PI/2) 
2121                  error = -output_phase;
2122              else
2123              {
2124                  error = PI-output_phase;
2125                  while(error>PI) error -= 2*PI;
2126              }
2127          }
2128          else error = PI*iof(output,i)*qof(output,i);
2129          if(output_error) output_error[i]=error;
2130          s->current_freq += error * s->beta;
2131          s->dphase = error * s->alpha + s->current_freq;
2132          if(s->dphase>s->dphase_max)  s->dphase = (s->dphase_max_reset_to_zero) ? 0 : s->dphase_max;
2133          if(s->dphase<-s->dphase_max) s->dphase = (s->dphase_max_reset_to_zero) ? 0 : -s->dphase_max;
2134          if(output_dphase) output_dphase[i]=s->dphase;
2135          //fprintf(stderr, "  error = %f; dphase = %f; nco_phase = %f;\n", error, s->dphase, s->nco_phase);
2136  
2137          //step NCO
2138          s->nco_phase += s->dphase;
2139          while(s->nco_phase>2*PI) s->nco_phase-=2*PI;
2140          while(s->nco_phase<=0) s->nco_phase+=2*PI;
2141      }
2142  }
2143  
2144  #if 0
2145  bpsk_costas_loop_state_t init_bpsk_costas_loop_cc(float samples_per_bits) 
2146  { 
2147      bpsk_costas_loop_state_t state;
2148      state.vco_phase = 0;
2149      state.last_vco_phase_addition = 0;
2150      float virtual_sampling_rate = 10000;
2151      float virtual_data_rate = virtual_sampling_rate / samples_per_bits;
2152      fprintf(stderr, "virtual_sampling_rate = %g, virtual_data_rate = %g\n", virtual_sampling_rate, virtual_data_rate);
2153      float rc_filter_cutoff = virtual_data_rate * 2; //this is so far the best
2154      float rc_filter_rc = 1/(2*M_PI*rc_filter_cutoff); //as of Equation 24 in Feigin
2155      float virtual_sampling_dt = 1.0/virtual_sampling_rate;
2156      fprintf(stderr, "rc_filter_cutoff = %g, rc_filter_rc = %g, virtual_sampling_dt = %g\n", 
2157          rc_filter_cutoff, rc_filter_rc, virtual_sampling_dt);
2158      state.rc_filter_alpha = virtual_sampling_dt/(rc_filter_rc+virtual_sampling_dt); //https://en.wikipedia.org/wiki/Low-pass_filter
2159      float rc_filter_omega_cutoff = 2*M_PI*rc_filter_cutoff;
2160      state.vco_phase_addition_multiplier = 8*rc_filter_omega_cutoff / (virtual_sampling_rate); //as of Equation 25 in Feigin, assuming input signal amplitude of 1 (to 1V) and (state.vco_phase_addition_multiplier*<vco_input>), a value in radians, will be added to the vco_phase directly.
2161      fprintf(stderr, "rc_filter_alpha = %g, rc_filter_omega_cutoff = %g, vco_phase_addition_multiplier = %g\n", 
2162              state.rc_filter_alpha, rc_filter_omega_cutoff, state.vco_phase_addition_multiplier);
2163      return state;
2164  }
2165  
2166  void bpsk_costas_loop_c1mc(complexf* input, complexf* output, int input_size, bpsk_costas_loop_state_t* state)
2167  {
2168      int debug = 0;
2169      if(debug) fprintf(stderr, "costas:\n");
2170      for(int i=0;i<input_size;i++)
2171      {
2172          float input_phase = atan2(input[i].q, input[i].i);
2173          float input_and_vco_mixed_phase = input_phase - state->vco_phase;
2174          if(debug) fprintf(stderr, "%g | %g\n", input_and_vco_mixed_phase, input_phase), debug--;
2175          complexf input_and_vco_mixed_sample; 
2176          e_powj(&input_and_vco_mixed_sample, input_and_vco_mixed_phase);
2177          
2178          complexf vco_sample;
2179          e_powj(&vco_sample, -state->vco_phase);
2180          //cmult(&input_and_vco_mixed_sample, &input[i], &vco_sample);//if this is enabled, the real input sample is used, not the amplitude normalized 
2181  
2182          float loop_output_i = 
2183              input_and_vco_mixed_sample.i * state->rc_filter_alpha + state->last_lpfi_output * (1-state->rc_filter_alpha);
2184          float loop_output_q = 
2185              input_and_vco_mixed_sample.q * state->rc_filter_alpha + state->last_lpfq_output * (1-state->rc_filter_alpha);
2186          //loop_output_i = input_and_vco_mixed_sample.i;
2187          //loop_output_q = input_and_vco_mixed_sample.q;
2188          state->last_lpfi_output = loop_output_i;
2189          state->last_lpfq_output = loop_output_q;
2190          float vco_phase_addition = loop_output_i * loop_output_q * state->vco_phase_addition_multiplier;
2191          //vco_phase_addition = vco_phase_addition * state->rc_filter_alpha + state->last_vco_phase_addition * (1-state->rc_filter_alpha);
2192          //state->last_vco_phase_addition = vco_phase_addition;
2193          state->vco_phase += vco_phase_addition;
2194          while(state->vco_phase>PI) state->vco_phase-=2*PI;
2195          while(state->vco_phase<-PI) state->vco_phase+=2*PI;
2196          cmult(&output[i], &input[i], &vco_sample);
2197      }
2198  }
2199  #endif
2200  
2201  void simple_agc_cc(complexf* input, complexf* output, int input_size, float rate, float reference, float max_gain, float* current_gain)
2202  {
2203      float rate_1minus=1-rate;
2204      int debugn = 0;
2205      for(int i=0;i<input_size;i++)
2206      {
2207          float amplitude = sqrt(input[i].i*input[i].i+input[i].q*input[i].q);
2208          float ideal_gain = (reference/amplitude);
2209          if(ideal_gain>max_gain) ideal_gain = max_gain;
2210          if(ideal_gain<=0) ideal_gain = 0;
2211          //*current_gain += (ideal_gain-(*current_gain))*rate;
2212          *current_gain = (ideal_gain-(*current_gain))*rate + (*current_gain)*rate_1minus;
2213          //if(debugn<100) fprintf(stderr, "cgain: %g\n", *current_gain), debugn++;
2214          output[i].i=(*current_gain)*input[i].i;
2215          output[i].q=(*current_gain)*input[i].q;
2216      }
2217  }
2218  
2219  void firdes_add_peak_c(complexf* output, int length, float rate, window_t window, int add, int normalize)
2220  {
2221      //add=0: malloc output previously
2222      //add=1: calloc output previously
2223      complexf* taps = (complexf*)malloc(sizeof(complexf)*length);
2224      int middle=length/2;
2225      float phase = 0, phase_addition = -rate*M_PI*2;
2226      float (*window_function)(float) = firdes_get_window_kernel(window);
2227      for(int i=0; i<length; i++) //@@firdes_add_peak_c: calculate taps
2228      {
2229          e_powj(&taps[i], phase);
2230          float window_multiplier = window_function(fabs((float)(middle-i)/middle));
2231          taps[i].i *= window_multiplier;
2232          taps[i].q *= window_multiplier;
2233          phase += phase_addition;
2234          while(phase>2*M_PI) phase-=2*M_PI;
2235          while(phase<0) phase+=2*M_PI;
2236      }
2237  
2238      //Normalize filter kernel
2239      if(add) 
2240          for(int i=0;i<length;i++)
2241          {
2242              output[i].i += taps[i].i;
2243              output[i].q += taps[i].q;
2244          }
2245      else for(int i=0;i<length;i++) output[i] = taps[i];
2246      if(normalize)
2247      {
2248          float sum=0;
2249          for(int i=0;i<length;i++) //@firdes_add_peak_c: normalize pass 1
2250          {
2251              sum+=sqrt(output[i].i*output[i].i + output[i].q*output[i].q);
2252          }
2253          for(int i=0;i<length;i++) //@firdes_add_peak_c: normalize pass 2
2254          {
2255              output[i].i/=sum;
2256              output[i].q/=sum;
2257          }
2258      }
2259  }
2260  
2261  int apply_fir_cc(complexf* input, complexf* output, int input_size, complexf* taps, int taps_length)
2262  {
2263      int i;
2264      for(i=0; i<input_size-taps_length+1; i++)
2265      {
2266          csetnull(&output[i]);
2267          for(int ti=0;ti<taps_length;ti++)
2268          {
2269              cmultadd(&output[i], &input[i+ti], &taps[ti]);
2270          }
2271      }
2272      return i;
2273  }
2274  
2275  
2276  int apply_real_fir_cc(complexf* input, complexf* output, int input_size, float* taps, int taps_length)
2277  {
2278      int i;
2279      for(i=0; i<input_size-taps_length+1; i++)
2280      {
2281          float acci = 0, accq = 0;
2282          for(int ti=0;ti<taps_length;ti++)
2283          {
2284              acci += iof(input,i+ti)*taps[ti];
2285              accq += qof(input,i+ti)*taps[ti];
2286          }
2287          iof(output,i)=acci;
2288          qof(output,i)=accq;
2289      }
2290      return i;
2291  }
2292  
2293  float normalized_timing_variance_u32_f(unsigned* input, float* temp, int input_size, int samples_per_symbol, int initial_sample_offset, int debug_print)
2294  {
2295      float *ndiff_rad = temp;
2296      float ndiff_rad_mean = 0;
2297      for(int i=0;i<input_size;i++) 
2298      {
2299          //find out which real sample index this input sample index is the nearest to.
2300          unsigned sinearest = (input[i]-initial_sample_offset) / samples_per_symbol;
2301          unsigned sinearest_remain = (input[i]-initial_sample_offset) % samples_per_symbol;
2302          if(sinearest_remain>samples_per_symbol/2) sinearest++;
2303          unsigned socorrect = initial_sample_offset+(sinearest*samples_per_symbol); //the sample offset which input[i] should have been, in order to sample at the maximum effect point
2304          int sodiff = abs(socorrect-input[i]);
2305          float ndiff = (float)sodiff/samples_per_symbol;
2306  
2307          ndiff_rad[i] = ndiff*PI;
2308          ndiff_rad_mean = ndiff_rad_mean*(((float)i)/(i+1))+(ndiff_rad[i]/(i+1));
2309          if(debug_print) fprintf(stderr, "input[%d] = %u, sinearest = %u, socorrect = %u, sodiff = %u, ndiff = %f, ndiff_rad[i] = %f, ndiff_rad_mean = %f\n", i, input[i], sinearest, socorrect, sodiff, ndiff, ndiff_rad[i], ndiff_rad_mean);
2310      }
2311      fprintf(stderr, "ndiff_rad_mean = %f\n", ndiff_rad_mean);
2312  
2313      float result = 0;
2314      for(int i=0;i<input_size;i++) result+=(powf(ndiff_rad[i]-ndiff_rad_mean,2))/(input_size-1);
2315      //fprintf(stderr, "nv = %f\n", result);
2316      return result;
2317  }
2318  
2319  void dbpsk_decoder_c_u8(complexf* input, unsigned char* output, int input_size)
2320  {
2321      static complexf last_input;
2322      for(int i=0;i<input_size;i++)
2323      {
2324          float phase = atan2(qof(input,i), iof(input,i));
2325          float last_phase = atan2(qofv(last_input), iofv(last_input));
2326          float dphase = phase-last_phase;
2327          while(dphase<-PI) dphase+=2*PI;
2328          while(dphase>=PI) dphase-=2*PI;
2329          if( (dphase>(PI/2)) || (dphase<(-PI/2)) ) output[i]=0;
2330          else output[i]=1;
2331          last_input = input[i];
2332      }
2333  }
2334  
2335  int bfsk_demod_cf(complexf* input, float* output, int input_size, complexf* mark_filter, complexf* space_filter, int taps_length)
2336  {
2337      complexf acc_space, acc_mark;
2338      for(int i=0; i<input_size-taps_length+1; i++)
2339      {
2340          csetnull(&acc_space);
2341          csetnull(&acc_mark);
2342          for(int ti=0;ti<taps_length;ti++)
2343          {
2344              cmultadd(&acc_space, &input[i+ti], &space_filter[ti]);
2345              cmultadd(&acc_mark,  &input[i+ti], &mark_filter[ti]);
2346          }
2347          output[i] = - ( iofv(acc_space)*iofv(acc_space)+qofv(acc_space)*qofv(acc_space) ) +
2348              ( iofv(acc_mark)*iofv(acc_mark)+qofv(acc_mark)*qofv(acc_mark) );
2349      }
2350      return input_size-taps_length+1;
2351  }
2352  
2353  /*
2354    _____        _                                            _
2355   |  __ \      | |                                          (_)
2356   | |  | | __ _| |_ __ _    ___ ___  _ ____   _____ _ __ ___ _  ___  _ __
2357   | |  | |/ _` | __/ _` |  / __/ _ \| '_ \ \ / / _ \ '__/ __| |/ _ \| '_ \
2358   | |__| | (_| | || (_| | | (_| (_) | | | \ V /  __/ |  \__ \ | (_) | | | |
2359   |_____/ \__,_|\__\__,_|  \___\___/|_| |_|\_/ \___|_|  |___/_|\___/|_| |_|
2360  
2361  */
2362  
2363  void convert_u8_f(unsigned char* input, float* output, int input_size)
2364  {
2365      for(int i=0;i<input_size;i++) output[i]=((float)input[i])/(UCHAR_MAX/2.0)-1.0; //@convert_u8_f
2366  }
2367  
2368  void convert_s8_f(signed char* input, float* output, int input_size)
2369  {
2370      for(int i=0;i<input_size;i++) output[i]=((float)input[i])/SCHAR_MAX; //@convert_s8_f
2371  }
2372  
2373  void convert_s16_f(short* input, float* output, int input_size)
2374  {
2375      for(int i=0;i<input_size;i++) output[i]=(float)input[i]/SHRT_MAX; //@convert_s16_f
2376  }
2377  
2378  void convert_f_u8(float* input, unsigned char* output, int input_size)
2379  {
2380      for(int i=0;i<input_size;i++) output[i]=input[i]*UCHAR_MAX*0.5+128; //@convert_f_u8
2381      //128 above is the correct value to add. In any other case a DC component
2382      //of at least -60 dB is shown on the FFT plot after convert_f_u8 -> convert_u8_f
2383  }
2384  
2385  void convert_f_s8(float* input, signed char* output, int input_size)
2386  {
2387      for(int i=0;i<input_size;i++) output[i]=input[i]*SCHAR_MAX; //@convert_f_s8
2388  }
2389  
2390  void convert_f_s16(float* input, short* output, int input_size)
2391  {
2392      /*for(int i=0;i<input_size;i++)
2393      {
2394          if(input[i]>1.0) input[i]=1.0;
2395          if(input[i]<-1.0) input[i]=-1.0;
2396      }*/
2397      for(int i=0;i<input_size;i++) output[i]=input[i]*SHRT_MAX; //@convert_f_s16
2398  }
2399  
2400  void convert_i16_f(short* input, float* output, int input_size) { convert_s16_f(input, output, input_size); }
2401  void convert_f_i16(float* input, short* output, int input_size) { convert_f_s16(input, output, input_size); }
2402  
2403  void convert_f_s24(float* input, unsigned char* output, int input_size, int bigendian)
2404  {
2405      int k=0;
2406      if(bigendian) for(int i=0;i<input_size;i++)
2407      {
2408          int temp=input[i]*(INT_MAX>>8);
2409          unsigned char* ptemp=(unsigned char*)&temp;
2410          output[k++]=*ptemp;
2411          output[k++]=*(ptemp+1);
2412          output[k++]=*(ptemp+2);
2413      }
2414      else for(int i=0;i<input_size;i++)
2415      {
2416          int temp=input[i]*(INT_MAX>>8);
2417          unsigned char* ptemp=(unsigned char*)&temp;
2418          output[k++]=*(ptemp+2);
2419          output[k++]=*(ptemp+1);
2420          output[k++]=*ptemp;
2421      }
2422  }
2423  
2424  void convert_s24_f(unsigned char* input, float* output, int input_size, int bigendian)
2425  {
2426      int k=0;
2427      if(bigendian) for(int i=0;i<input_size*3;i+=3)
2428      {
2429          int temp=(input[i+2]<<24)|(input[i+1]<<16)|(input[i]<<8);
2430          output[k++]=temp/(float)(INT_MAX-256);
2431      }
2432      else for(int i=0;i<input_size*3;i+=3)
2433      {
2434          int temp=(input[i+2]<<8)|(input[i+1]<<16)|(input[i]<<24);
2435          output[k++]=temp/(float)(INT_MAX-256);
2436      }
2437  }
2438  
2439  FILE* init_get_random_samples_f()
2440  {
2441      return fopen("/dev/urandom", "r");
2442  }
2443  
2444  void get_random_samples_f(float* output, int output_size, FILE* status)
2445  {
2446      int* pioutput = (int*)output;
2447      fread((unsigned char*)output, sizeof(float), output_size, status);
2448      for(int i=0;i<output_size;i++)
2449      {
2450          float tempi = pioutput[i];
2451          output[i] = tempi/((float)(INT_MAX)); //*0.82
2452      }
2453  }
2454  
2455  void get_random_gaussian_samples_c(complexf* output, int output_size, FILE* status)
2456  {
2457      int* pioutput = (int*)output;
2458      fread((unsigned char*)output, sizeof(complexf), output_size, status);
2459      for(int i=0;i<output_size;i++)
2460      {
2461          float u1 = 0.5+0.49999999*(((float)pioutput[2*i])/(float)INT_MAX);
2462          float u2 = 0.5+0.49999999*(((float)pioutput[2*i+1])/(float)INT_MAX);
2463          iof(output, i)=sqrt(-2*log(u1))*cos(2*PI*u2);
2464          qof(output, i)=sqrt(-2*log(u1))*sin(2*PI*u2);
2465      }
2466  }
2467  
2468  int deinit_get_random_samples_f(FILE* status)
2469  {
2470      return fclose(status);
2471  }
2472  
2473  int firdes_cosine_f(float* taps, int taps_length, int samples_per_symbol)
2474  {
2475      //needs a taps_length 2 × samples_per_symbol + 1
2476      int middle_i=taps_length/2;
2477      for(int i=0;i<samples_per_symbol;i++) taps[middle_i+i]=taps[middle_i-i]=(1+cos(PI*i/(float)samples_per_symbol))/2;
2478      //for(int i=0;i<taps_length;i++) taps[i]=powf(taps[i],2);
2479      normalize_fir_f(taps, taps, taps_length);
2480  }
2481  
2482  int firdes_rrc_f(float* taps, int taps_length, int samples_per_symbol, float beta)
2483  {
2484      //needs an odd taps_length
2485      int middle_i=taps_length/2;
2486      taps[middle_i]=(1/(float)samples_per_symbol)*(1+beta*(4/PI-1));
2487      for(int i=1;i<1+taps_length/2;i++) 
2488      {
2489          if(i==samples_per_symbol/(4*beta)) 
2490              taps[middle_i+i]=taps[middle_i-i]=(beta/(samples_per_symbol*sqrt(2)))*((1+(2/PI))*sin(PI/(4*beta))+(1-(2/PI))*cos(PI/(4*beta)));
2491          else
2492              taps[middle_i+i]=taps[middle_i-i]=(1/(float)samples_per_symbol)*
2493                  (sin(PI*(i/(float)samples_per_symbol)*(1-beta)) + 4*beta*(i/(float)samples_per_symbol)*cos(PI*(i/(float)samples_per_symbol)*(1+beta)))/
2494                  (PI*(i/(float)samples_per_symbol)*(1-powf(4*beta*(i/(float)samples_per_symbol),2)));
2495      }
2496      normalize_fir_f(taps, taps, taps_length);
2497  }
2498  
2499  void plain_interpolate_cc(complexf* input, complexf* output, int input_size, int interpolation)
2500  {
2501      for(int i=0;i<input_size;i++)
2502      {
2503          output[i*interpolation]=input[i];
2504          bzero(output+(interpolation*i)+1, (interpolation-1)*sizeof(complexf));
2505      }
2506  }
2507  
2508  #define MMATCHEDFILT_GAS(NAME) \
2509      if(!strcmp( #NAME , input )) return MATCHED_FILTER_ ## NAME;
2510  
2511  matched_filter_type_t matched_filter_get_type_from_string(char* input)
2512  {
2513      MMATCHEDFILT_GAS(RRC);
2514      MMATCHEDFILT_GAS(COSINE);
2515      return MATCHED_FILTER_DEFAULT;
2516  }
2517  
2518  float* add_ff(float* input1, float* input2, float* output, int input_size)
2519  {
2520      for(int i=0;i<input_size;i++) output[i]=input1[i]+input2[i];
2521  }
2522  
2523  float* add_const_cc(complexf* input, complexf* output, int input_size, complexf x)
2524  {
2525      for(int i=0;i<input_size;i++)
2526      {
2527          iof(output,i)=iof(input,i)+iofv(x);
2528          qof(output,i)=iof(input,i)+qofv(x);
2529      }
2530  }
2531  
2532  int trivial_vectorize()
2533  {
2534      //this function is trivial to vectorize and should pass on both NEON and SSE
2535      int a[1024], b[1024], c[1024];
2536      for(int i=0; i<1024; i++) //@trivial_vectorize: should pass :-)
2537      {
2538          c[i]=a[i]*b[i];
2539      }
2540      return c[0];
2541  }